
Liu Yuntao/openssl

forked from src-openEuler/openssl 
Backport-SM4-optimization-for-ARM-by-ASIMD.patch 38.43 KB
xuyizhou committed on 2023-03-16 09:45: backport SM3/SM4 optimization
From ca0b08e39bb619b6e62ef58c80edc784e8f20966 Mon Sep 17 00:00:00 2001
From: Daniel Hu <Daniel.Hu@arm.com>
Date: Mon, 14 Feb 2022 14:36:34 +0000
Subject: [PATCH 07/13] SM4 optimization for ARM by ASIMD

This patch optimizes SM4 for ARM processors using ASIMD instructions.
It improves performance when both of the following conditions are met:
1) the input is at least 4 blocks of data
2) the cipher mode allows parallelism: ECB, CTR, GCM, or CBC decryption
The patch implements the SM4 SBOX lookup in vector registers, which gives
constant processing time, an advantage over the existing C implementation.
It is enabled only for the N1/V1 micro-architectures. In the ideal scenario,
performance can reach up to 2.7x.
When either of the above two conditions is not met, e.g. single-block input,
CFB/OFB mode, or CBC encryption, performance can drop by about 50%.
The assembly code has been reviewed internally by ARM engineer
Fangming.Fang@arm.com
Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17951)
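
As a concrete reading of condition (1): the ECB/CBC/CTR drivers below enter
the vector path only while at least four blocks remain, then finish the tail
one block at a time. A minimal C sketch of that shape, with generic function
pointers standing in for the SM4 primitives (names here are illustrative,
not taken from the patch):

    #include <stdint.h>
    #include <stddef.h>

    /* Loosely modelled on OpenSSL's block128_f; the 4-wide variant is
     * assumed to process 64 bytes (four blocks) per call. */
    typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
                               const void *key);
    typedef void (*block4x_f)(const uint8_t in[64], uint8_t out[64],
                              const void *key);

    static void ecb_encrypt_sketch(const uint8_t *in, uint8_t *out,
                                   size_t blocks, const void *key,
                                   block128_f enc_1blk, block4x_f enc_4blk)
    {
        while (blocks >= 4) {          /* cmp $blocks,#4 ; b.lt ...     */
            enc_4blk(in, out, key);    /* one S-box pass covers 4 lanes */
            in += 64; out += 64; blocks -= 4;
        }
        while (blocks--) {             /* scalar tail, at most 3 blocks */
            enc_1blk(in, out, key);
            in += 16; out += 16;
        }
    }
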
---
crypto/evp/e_sm4.c | 24 +
crypto/sm4/asm/vpsm4-armv8.pl | 1118 +++++++++++++++++
crypto/sm4/build.info | 6 +-
include/crypto/sm4_platform.h | 29 +
.../ciphers/cipher_sm4_gcm_hw.c | 7 +
.../implementations/ciphers/cipher_sm4_hw.c | 24 +
6 files changed, 1206 insertions(+), 2 deletions(-)
create mode 100755 crypto/sm4/asm/vpsm4-armv8.pl
diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
index bff79ff197..c8e8cfe9c9 100644
--- a/crypto/evp/e_sm4.c
+++ b/crypto/evp/e_sm4.c
@@ -76,6 +76,17 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
# endif
} else
+#endif
+#ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_decrypt_key(key, &dat->ks.ks);
+ dat->block = (block128_f) vpsm4_decrypt;
+ dat->stream.cbc = NULL;
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt;
+ else if (mode == EVP_CIPH_ECB_MODE)
+ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt;
+ } else
#endif
{
dat->block = (block128_f) ossl_sm4_decrypt;
@@ -104,6 +115,19 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
# endif
(void)0; /* terminate potentially open 'else' */
} else
+#endif
+#ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_encrypt_key(key, &dat->ks.ks);
+ dat->block = (block128_f) vpsm4_encrypt;
+ dat->stream.cbc = NULL;
+ if (mode == EVP_CIPH_CBC_MODE)
+ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt;
+ else if (mode == EVP_CIPH_ECB_MODE)
+ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt;
+ else if (mode == EVP_CIPH_CTR_MODE)
+ dat->stream.ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks;
+ } else
#endif
{
dat->block = (block128_f) ossl_sm4_encrypt;
diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
new file mode 100755
index 0000000000..095d9dae64
--- /dev/null
+++ b/crypto/sm4/asm/vpsm4-armv8.pl
@@ -0,0 +1,1118 @@
+#! /usr/bin/env perl
+# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# This module implements SM4 with ASIMD on aarch64
+#
+# Feb 2022
+#
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$prefix="vpsm4";
+my @vtmp=map("v$_",(0..3));
+my @data=map("v$_",(4..7));
+my @datax=map("v$_",(8..11));
+my ($rk0,$rk1)=("v12","v13");
+my ($rka,$rkb)=("v14","v15");
+my @vtmpx=map("v$_",(12..15));
+my @sbox=map("v$_",(16..31));
+my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
+my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
+my ($ptr,$counter)=("x10","w11");
+my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
+
+sub rev32() {
+ my $dst = shift;
+ my $src = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+$code.=<<___;
+#ifndef __ARMEB__
+ rev32 $dst.16b,$src.16b
+#else
+ mov $dst.16b,$src.16b
+#endif
+___
+ } else {
+$code.=<<___;
+#ifndef __ARMEB__
+ rev32 $dst.16b,$dst.16b
+#endif
+___
+ }
+}
+
+sub transpose() {
+ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
+
+$code.=<<___;
+ zip1 $vt0.4s,$dat0.4s,$dat1.4s
+ zip2 $vt1.4s,$dat0.4s,$dat1.4s
+ zip1 $vt2.4s,$dat2.4s,$dat3.4s
+ zip2 $vt3.4s,$dat2.4s,$dat3.4s
+ zip1 $dat0.2d,$vt0.2d,$vt2.2d
+ zip2 $dat1.2d,$vt0.2d,$vt2.2d
+ zip1 $dat2.2d,$vt1.2d,$vt3.2d
+ zip2 $dat3.2d,$vt1.2d,$vt3.2d
+___
+}
+
+# sbox operations for 4-lane of words
+sub sbox() {
+ my $dat = shift;
+
+$code.=<<___;
+ movi @vtmp[0].16b,#64
+ movi @vtmp[1].16b,#128
+ movi @vtmp[2].16b,#192
+ sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
+ sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
+ sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
+ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
+ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
+ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
+ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
+ add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
+ add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
+ add $dat.2d,@vtmp[0].2d,@vtmp[2].2d
+
+ ushr @vtmp[0].4s,$dat.4s,32-2
+ sli @vtmp[0].4s,$dat.4s,2
+ ushr @vtmp[2].4s,$dat.4s,32-10
+ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
+ sli @vtmp[2].4s,$dat.4s,10
+ eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
+ ushr @vtmp[0].4s,$dat.4s,32-18
+ sli @vtmp[0].4s,$dat.4s,18
+ ushr @vtmp[2].4s,$dat.4s,32-24
+ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
+ sli @vtmp[2].4s,$dat.4s,24
+ eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
+___
+}
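
The sbox() helper above performs a full 256-entry table lookup with four TBL
instructions: indices 0-63 address the first four S-box registers, and each
further group of 64 is reached by subtracting 64, relying on TBL returning
zero for out-of-range indices so that the four partial results can simply be
merged. A minimal AArch64 NEON-intrinsics sketch of that lookup (assumes
<arm_neon.h>; the SM4 linear transform that follows in sbox() is omitted):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Load one 64-byte quarter of the 256-entry S-box into a TBL table. */
    static inline uint8x16x4_t load_sbox_quarter(const uint8_t *p)
    {
        uint8x16x4_t t;
        t.val[0] = vld1q_u8(p);
        t.val[1] = vld1q_u8(p + 16);
        t.val[2] = vld1q_u8(p + 32);
        t.val[3] = vld1q_u8(p + 48);
        return t;
    }

    /* 16 parallel S-box lookups: TBL yields 0 for any index >= 64, so after
     * offsetting the indices by 0/64/128/192 exactly one of the four lookups
     * is non-zero per lane, and OR (the assembly uses ADD, equivalent here)
     * merges them. */
    static inline uint8x16_t sbox_lookup16(uint8x16_t idx,
                                           const uint8x16x4_t tab[4])
    {
        uint8x16_t off = vdupq_n_u8(64);
        uint8x16_t i1 = vsubq_u8(idx, off);
        uint8x16_t i2 = vsubq_u8(i1, off);
        uint8x16_t i3 = vsubq_u8(i2, off);

        uint8x16_t r = vqtbl4q_u8(tab[0], idx);
        r = vorrq_u8(r, vqtbl4q_u8(tab[1], i1));
        r = vorrq_u8(r, vqtbl4q_u8(tab[2], i2));
        r = vorrq_u8(r, vqtbl4q_u8(tab[3], i3));
        return r;
    }
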
+
+# sbox operation for 8-lane of words
+sub sbox_double() {
+ my $dat = shift;
+ my $datx = shift;
+
+$code.=<<___;
+ movi @vtmp[3].16b,#64
+ sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
+ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
+ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
+ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
+ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
+ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
+ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
+ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
+ add $dat.2d,@vtmp[2].2d,$dat.2d
+ add $dat.2d,@vtmp[1].2d,$dat.2d
+
+ sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
+ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
+ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
+ tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
+ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
+ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
+ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
+ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
+ add $datx.2d,@vtmp[2].2d,$datx.2d
+ add $datx.2d,@vtmp[1].2d,$datx.2d
+
+ ushr @vtmp[0].4s,$dat.4s,32-2
+ sli @vtmp[0].4s,$dat.4s,2
+ ushr @vtmp[2].4s,$datx.4s,32-2
+ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
+ sli @vtmp[2].4s,$datx.4s,2
+
+ ushr @vtmp[0].4s,$dat.4s,32-10
+ eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
+ sli @vtmp[0].4s,$dat.4s,10
+ ushr @vtmp[2].4s,$datx.4s,32-10
+ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
+ sli @vtmp[2].4s,$datx.4s,10
+
+ ushr @vtmp[0].4s,$dat.4s,32-18
+ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
+ sli @vtmp[0].4s,$dat.4s,18
+ ushr @vtmp[2].4s,$datx.4s,32-18
+ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
+ sli @vtmp[2].4s,$datx.4s,18
+
+ ushr @vtmp[0].4s,$dat.4s,32-24
+ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
+ sli @vtmp[0].4s,$dat.4s,24
+ ushr @vtmp[2].4s,$datx.4s,32-24
+ eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
+ sli @vtmp[2].4s,$datx.4s,24
+ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
+___
+}
+
+# sbox operation for one single word
+sub sbox_1word () {
+ my $word = shift;
+
+$code.=<<___;
+ movi @vtmp[1].16b,#64
+ movi @vtmp[2].16b,#128
+ movi @vtmp[3].16b,#192
+ mov @vtmp[0].s[0],$word
+
+ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
+ sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
+ sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
+
+ tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
+ tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
+ tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
+ tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
+
+ mov $word,@vtmp[0].s[0]
+ mov $wtmp0,@vtmp[1].s[0]
+ mov $wtmp2,@vtmp[2].s[0]
+ add $wtmp0,$word,$wtmp0
+ mov $word,@vtmp[3].s[0]
+ add $wtmp0,$wtmp0,$wtmp2
+ add $wtmp0,$wtmp0,$word
+
+ eor $word,$wtmp0,$wtmp0,ror #32-2
+ eor $word,$word,$wtmp0,ror #32-10
+ eor $word,$word,$wtmp0,ror #32-18
+ eor $word,$word,$wtmp0,ror #32-24
+___
+}
+
+# sm4 for one block of data, in scalar registers word0/word1/word2/word3
+sub sm4_1blk () {
+ my $kptr = shift;
+
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor $tmpw,$word2,$word3
+ eor $wtmp2,$wtmp0,$word1
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ eor $word0,$word0,$tmpw
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor $tmpw,$word2,$word3
+ eor $wtmp2,$word0,$wtmp1
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ eor $word1,$word1,$tmpw
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor $tmpw,$word0,$word1
+ eor $wtmp2,$wtmp0,$word3
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ eor $word2,$word2,$tmpw
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor $tmpw,$word0,$word1
+ eor $wtmp2,$word2,$wtmp1
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ eor $word3,$word3,$tmpw
+___
+}
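
sm4_1blk() above is the scalar form of one group of four SM4 rounds, exactly
as the inline comments describe, with T(x) = L(S(x)) and
L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24) (the ushr/sli and ror
sequences earlier). The same computation in plain C, as a sketch with the
S-box table passed in rather than hard-coded:

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* T(x) = L(S(x)): byte-wise S-box followed by the linear transform. */
    static uint32_t sm4_T(uint32_t x, const uint8_t sbox[256])
    {
        uint32_t b = ((uint32_t)sbox[(x >> 24) & 0xff] << 24) |
                     ((uint32_t)sbox[(x >> 16) & 0xff] << 16) |
                     ((uint32_t)sbox[(x >>  8) & 0xff] <<  8) |
                      (uint32_t)sbox[x & 0xff];
        return b ^ rotl32(b, 2) ^ rotl32(b, 10) ^ rotl32(b, 18) ^ rotl32(b, 24);
    }

    /* One group of four SM4 rounds on words B0..B3 with round keys rk[0..3],
     * the scalar equivalent of sm4_1blk(). */
    static void sm4_round4(uint32_t B[4], const uint32_t rk[4],
                           const uint8_t sbox[256])
    {
        B[0] ^= sm4_T(B[1] ^ B[2] ^ B[3] ^ rk[0], sbox);
        B[1] ^= sm4_T(B[0] ^ B[2] ^ B[3] ^ rk[1], sbox);
        B[2] ^= sm4_T(B[0] ^ B[1] ^ B[3] ^ rk[2], sbox);
        B[3] ^= sm4_T(B[0] ^ B[1] ^ B[2] ^ rk[3], sbox);
    }
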
+
+# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
+sub sm4_4blks () {
+ my $kptr = shift;
+
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ dup $rk0.4s,$wtmp0
+ dup $rk1.4s,$wtmp1
+
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor $rka.16b,@data[2].16b,@data[3].16b
+ eor $rk0.16b,@data[1].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,$rk0.16b
+___
+ &sbox($rk0);
+$code.=<<___;
+ eor @data[0].16b,@data[0].16b,$rk0.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor $rka.16b,$rka.16b,@data[0].16b
+ eor $rk1.16b,$rka.16b,$rk1.16b
+___
+ &sbox($rk1);
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ eor @data[1].16b,@data[1].16b,$rk1.16b
+
+ dup $rk0.4s,$wtmp0
+ dup $rk1.4s,$wtmp1
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor $rka.16b,@data[0].16b,@data[1].16b
+ eor $rk0.16b,@data[3].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,$rk0.16b
+___
+ &sbox($rk0);
+$code.=<<___;
+ eor @data[2].16b,@data[2].16b,$rk0.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor $rka.16b,$rka.16b,@data[2].16b
+ eor $rk1.16b,$rka.16b,$rk1.16b
+___
+ &sbox($rk1);
+$code.=<<___;
+ eor @data[3].16b,@data[3].16b,$rk1.16b
+___
+}
+
+# sm4 for 8 lanes of data, in neon registers
+# data0/data1/data2/data3 datax0/datax1/datax2/datax3
+sub sm4_8blks () {
+ my $kptr = shift;
+
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ dup $rk0.4s,$wtmp0
+ eor $rka.16b,@data[2].16b,@data[3].16b
+ eor $rkb.16b,@datax[2].16b,@datax[3].16b
+ eor @vtmp[0].16b,@data[1].16b,$rk0.16b
+ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,@vtmp[0].16b
+ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ eor @data[0].16b,@data[0].16b,$rk0.16b
+ eor @datax[0].16b,@datax[0].16b,$rk1.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ dup $rk1.4s,$wtmp1
+ eor $rka.16b,$rka.16b,@data[0].16b
+ eor $rkb.16b,$rkb.16b,@datax[0].16b
+ eor $rk0.16b,$rka.16b,$rk1.16b
+ eor $rk1.16b,$rkb.16b,$rk1.16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ eor @data[1].16b,@data[1].16b,$rk0.16b
+ eor @datax[1].16b,@datax[1].16b,$rk1.16b
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ dup $rk0.4s,$wtmp0
+ eor $rka.16b,@data[0].16b,@data[1].16b
+ eor $rkb.16b,@datax[0].16b,@datax[1].16b
+ eor @vtmp[0].16b,@data[3].16b,$rk0.16b
+ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,@vtmp[0].16b
+ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ eor @data[2].16b,@data[2].16b,$rk0.16b
+ eor @datax[2].16b,@datax[2].16b,$rk1.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ dup $rk1.4s,$wtmp1
+ eor $rka.16b,$rka.16b,@data[2].16b
+ eor $rkb.16b,$rkb.16b,@datax[2].16b
+ eor $rk0.16b,$rka.16b,$rk1.16b
+ eor $rk1.16b,$rkb.16b,$rk1.16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ eor @data[3].16b,@data[3].16b,$rk0.16b
+ eor @datax[3].16b,@datax[3].16b,$rk1.16b
+___
+}
+
+sub encrypt_1blk_norev() {
+ my $dat = shift;
+
+$code.=<<___;
+ mov $ptr,$rks
+ mov $counter,#8
+ mov $word0,$dat.s[0]
+ mov $word1,$dat.s[1]
+ mov $word2,$dat.s[2]
+ mov $word3,$dat.s[3]
+10:
+___
+ &sm4_1blk($ptr);
+$code.=<<___;
+ subs $counter,$counter,#1
+ b.ne 10b
+ mov $dat.s[0],$word3
+ mov $dat.s[1],$word2
+ mov $dat.s[2],$word1
+ mov $dat.s[3],$word0
+___
+}
+
+sub encrypt_1blk() {
+ my $dat = shift;
+
+ &encrypt_1blk_norev($dat);
+ &rev32($dat,$dat);
+}
+
+sub encrypt_4blks() {
+$code.=<<___;
+ mov $ptr,$rks
+ mov $counter,#8
+10:
+___
+ &sm4_4blks($ptr);
+$code.=<<___;
+ subs $counter,$counter,#1
+ b.ne 10b
+___
+ &rev32(@vtmp[3],@data[0]);
+ &rev32(@vtmp[2],@data[1]);
+ &rev32(@vtmp[1],@data[2]);
+ &rev32(@vtmp[0],@data[3]);
+}
+
+sub encrypt_8blks() {
+$code.=<<___;
+ mov $ptr,$rks
+ mov $counter,#8
+10:
+___
+ &sm4_8blks($ptr);
+$code.=<<___;
+ subs $counter,$counter,#1
+ b.ne 10b
+___
+ &rev32(@vtmp[3],@data[0]);
+ &rev32(@vtmp[2],@data[1]);
+ &rev32(@vtmp[1],@data[2]);
+ &rev32(@vtmp[0],@data[3]);
+ &rev32(@data[3],@datax[0]);
+ &rev32(@data[2],@datax[1]);
+ &rev32(@data[1],@datax[2]);
+ &rev32(@data[0],@datax[3]);
+}
+
+sub load_sbox () {
+ my $data = shift;
+
+$code.=<<___;
+ adr $ptr,.Lsbox
+ ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64
+ ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64
+ ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64
+ ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr]
+___
+}
+
+$code=<<___;
+#include "arm_arch.h"
+.arch armv8-a
+.text
+
+.type _vpsm4_consts,%object
+.align 7
+_vpsm4_consts:
+.Lsbox:
+ .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
+ .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
+ .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
+ .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
+ .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
+ .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
+ .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
+ .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
+ .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
+ .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
+ .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
+ .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
+ .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
+ .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
+ .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
+ .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
+.Lck:
+ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
+.Lshuffles:
+ .dword 0x0B0A090807060504,0x030201000F0E0D0C
+
+.size _vpsm4_consts,.-_vpsm4_consts
+___
+
+{{{
+my ($key,$keys,$enc)=("x0","x1","w2");
+my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
+my ($vkey,$vfk,$vmap)=("v5","v6","v7");
+$code.=<<___;
+.type _vpsm4_set_key,%function
+.align 4
+_vpsm4_set_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {$vkey.4s},[$key]
+___
+ &load_sbox();
+ &rev32($vkey,$vkey);
+$code.=<<___;
+ adr $pointer,.Lshuffles
+ ld1 {$vmap.4s},[$pointer]
+ adr $pointer,.Lfk
+ ld1 {$vfk.4s},[$pointer]
+ eor $vkey.16b,$vkey.16b,$vfk.16b
+ mov $schedules,#32
+ adr $pointer,.Lck
+ movi @vtmp[0].16b,#64
+ cbnz $enc,1f
+ add $keys,$keys,124
+1:
+ mov $wtmp,$vkey.s[1]
+ ldr $roundkey,[$pointer],#4
+ eor $roundkey,$roundkey,$wtmp
+ mov $wtmp,$vkey.s[2]
+ eor $roundkey,$roundkey,$wtmp
+ mov $wtmp,$vkey.s[3]
+ eor $roundkey,$roundkey,$wtmp
+ // sbox lookup
+ mov @data[0].s[0],$roundkey
+ tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
+ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
+ tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
+ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
+ tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
+ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
+ tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
+ mov $wtmp,@vtmp[1].s[0]
+ eor $roundkey,$wtmp,$wtmp,ror #19
+ eor $roundkey,$roundkey,$wtmp,ror #9
+ mov $wtmp,$vkey.s[0]
+ eor $roundkey,$roundkey,$wtmp
+ mov $vkey.s[0],$roundkey
+ cbz $enc,2f
+ str $roundkey,[$keys],#4
+ b 3f
+2:
+ str $roundkey,[$keys],#-4
+3:
+ tbl $vkey.16b,{$vkey.16b},$vmap.16b
+ subs $schedules,$schedules,#1
+ b.ne 1b
+ ret
+.size _vpsm4_set_key,.-_vpsm4_set_key
+___
+}}}
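
_vpsm4_set_key follows the standard SM4 key schedule: XOR the user key with
FK, then 32 iterations that mix in CK, run one S-box lookup, and apply the
key-schedule linear transform L'(B) = B ^ (B<<<13) ^ (B<<<23) (the
ror #19 / ror #9 pair above), storing round keys forwards for encryption and
backwards for decryption. A plain-C sketch of that schedule, with the S-box
passed in and CK regenerated from its (4i+j)*7 mod 256 definition instead of
being read from .Lck:

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* Key-schedule variant of T: same S-box, but L'(B) = B^(B<<<13)^(B<<<23). */
    static uint32_t sm4_Tprime(uint32_t x, const uint8_t sbox[256])
    {
        uint32_t b = ((uint32_t)sbox[(x >> 24) & 0xff] << 24) |
                     ((uint32_t)sbox[(x >> 16) & 0xff] << 16) |
                     ((uint32_t)sbox[(x >>  8) & 0xff] <<  8) |
                      (uint32_t)sbox[x & 0xff];
        return b ^ rotl32(b, 13) ^ rotl32(b, 23);
    }

    /* Expand a 128-bit key (4 big-endian words mk[0..3]) into 32 round keys.
     * Decryption uses the same keys in reverse order, which is why the
     * assembly writes them back-to-front when $enc == 0. */
    static void sm4_key_schedule(const uint32_t mk[4], uint32_t rk[32],
                                 int enc, const uint8_t sbox[256])
    {
        static const uint32_t FK[4] = {
            0xA3B1BAC6, 0x56AA3350, 0x677D9197, 0xB27022DC  /* .Lfk */
        };
        uint32_t K[4];
        for (int i = 0; i < 4; i++)
            K[i] = mk[i] ^ FK[i];

        for (int i = 0; i < 32; i++) {
            uint32_t ck = 0;                       /* CK[i], as in .Lck */
            for (int j = 0; j < 4; j++)
                ck = (ck << 8) | (uint8_t)((4 * i + j) * 7);

            uint32_t t = K[0] ^ sm4_Tprime(K[1] ^ K[2] ^ K[3] ^ ck, sbox);
            K[0] = K[1]; K[1] = K[2]; K[2] = K[3]; K[3] = t;
            rk[enc ? i : 31 - i] = t;
        }
    }
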
+
+
+{{{
+$code.=<<___;
+.type _vpsm4_enc_4blks,%function
+.align 4
+_vpsm4_enc_4blks:
+ AARCH64_VALID_CALL_TARGET
+___
+ &encrypt_4blks();
+$code.=<<___;
+ ret
+.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
+___
+}}}
+
+{{{
+$code.=<<___;
+.type _vpsm4_enc_8blks,%function
+.align 4
+_vpsm4_enc_8blks:
+ AARCH64_VALID_CALL_TARGET
+___
+ &encrypt_8blks();
+$code.=<<___;
+ ret
+.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
+___
+}}}
+
+
+{{{
+my ($key,$keys)=("x0","x1");
+$code.=<<___;
+.globl ${prefix}_set_encrypt_key
+.type ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,1
+ bl _vpsm4_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+___
+}}}
+
+{{{
+my ($key,$keys)=("x0","x1");
+$code.=<<___;
+.globl ${prefix}_set_decrypt_key
+.type ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,0
+ bl _vpsm4_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+
+{{{
+sub gen_block () {
+ my $dir = shift;
+ my ($inp,$outp,$rk)=map("x$_",(0..2));
+
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {@data[0].16b},[$inp]
+___
+ &load_sbox();
+ &rev32(@data[0],@data[0]);
+$code.=<<___;
+ mov $rks,x2
+___
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ st1 {@data[0].16b},[$outp]
+ ret
+.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+
+{{{
+my ($enc) = ("w4");
+my @dat=map("v$_",(16..23));
+
+$code.=<<___;
+.globl ${prefix}_ecb_encrypt
+.type ${prefix}_ecb_encrypt,%function
+.align 5
+${prefix}_ecb_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ // convert length into blocks
+ lsr x2,x2,4
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+___
+ &load_sbox();
+$code.=<<___;
+.Lecb_8_blocks_process:
+ cmp $blocks,#8
+ b.lt .Lecb_4_blocks_process
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],@datax[3]);
+$code.=<<___;
+ bl _vpsm4_enc_8blks
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.gt .Lecb_8_blocks_process
+ b 100f
+.Lecb_4_blocks_process:
+ cmp $blocks,#4
+ b.lt 1f
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ sub $blocks,$blocks,#4
+1:
+ // process last block
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@data[0].16b},[$inp]
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ st1 {@data[0].16b},[$outp]
+ b 100f
+1: // process last 2 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
+ cmp $blocks,#2
+ b.gt 1f
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
+ b 100f
+1: // process last 3 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
+ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
+___
+}}}
+
+{{{
+my ($len,$ivp,$enc)=("x2","x4","w5");
+my $ivec0=("v3");
+my $ivec1=("v15");
+
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ lsr $len,$len,4
+___
+ &load_sbox();
+$code.=<<___;
+ cbz $enc,.Ldec
+ ld1 {$ivec0.4s},[$ivp]
+.Lcbc_4_blocks_enc:
+ cmp $blocks,#4
+ b.lt 1f
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+ eor @data[0].16b,@data[0].16b,$ivec0.16b
+___
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &encrypt_1blk_norev(@data[0]);
+$code.=<<___;
+ eor @data[1].16b,@data[1].16b,@data[0].16b
+___
+ &encrypt_1blk_norev(@data[1]);
+ &rev32(@data[0],@data[0]);
+
+$code.=<<___;
+ eor @data[2].16b,@data[2].16b,@data[1].16b
+___
+ &encrypt_1blk_norev(@data[2]);
+ &rev32(@data[1],@data[1]);
+$code.=<<___;
+ eor @data[3].16b,@data[3].16b,@data[2].16b
+___
+ &encrypt_1blk_norev(@data[3]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ orr $ivec0.16b,@data[3].16b,@data[3].16b
+ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#4
+ b.ne .Lcbc_4_blocks_enc
+ b 2f
+1:
+ subs $blocks,$blocks,#1
+ b.lt 2f
+ ld1 {@data[0].4s},[$inp],#16
+ eor $ivec0.16b,$ivec0.16b,@data[0].16b
+___
+ &rev32($ivec0,$ivec0);
+ &encrypt_1blk($ivec0);
+$code.=<<___;
+ st1 {$ivec0.16b},[$outp],#16
+ b 1b
+2:
+ // save back IV
+ st1 {$ivec0.16b},[$ivp]
+ ret
+
+.Ldec:
+ // decryption mode starts
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+.Lcbc_8_blocks_dec:
+ cmp $blocks,#8
+ b.lt 1f
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
+ add $ptr,$inp,#64
+ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],$data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],$datax[3]);
+$code.=<<___;
+ bl _vpsm4_enc_8blks
+___
+ &transpose(@vtmp,@datax);
+ &transpose(@data,@datax);
+$code.=<<___;
+ ld1 {$ivec1.16b},[$ivp]
+ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+ // note ivec1 and vtmpx[3] are reusing the same register
+ // care needs to be taken to avoid conflict
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
+ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
+ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
+ // save back IV
+ st1 {$vtmpx[3].16b}, [$ivp]
+ eor @data[0].16b,@data[0].16b,$datax[3].16b
+ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
+ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
+ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.gt .Lcbc_8_blocks_dec
+ b.eq 100f
+1:
+ ld1 {$ivec1.16b},[$ivp]
+.Lcbc_4_blocks_dec:
+ cmp $blocks,#4
+ b.lt 1f
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],$data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &transpose(@vtmp,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ orr $ivec1.16b,@data[3].16b,@data[3].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
+ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ subs $blocks,$blocks,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+ st1 {@vtmp[3].16b}, [$ivp]
+ b 100f
+1: // last block
+ subs $blocks,$blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@data[0].4s},[$inp],#16
+ // save back IV
+ st1 {$data[0].16b}, [$ivp]
+___
+ &rev32(@datax[0],@data[0]);
+ &encrypt_1blk(@datax[0]);
+$code.=<<___;
+ eor @datax[0].16b,@datax[0].16b,$ivec1.16b
+ st1 {@datax[0].16b},[$outp],#16
+ b 100f
+1: // last two blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
+ add $ptr,$inp,#16
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
+ subs $blocks,$blocks,1
+ b.gt 1f
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
+___
+ &transpose(@vtmp,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
+ // save back IV
+ st1 {@data[1].16b}, [$ivp]
+ b 100f
+1: // last 3 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _vpsm4_enc_4blks
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
+___
+ &transpose(@vtmp,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
+ // save back IV
+ st1 {@data[2].16b}, [$ivp]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
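
The CBC decryption path can batch 4 or 8 blocks because every D_K(C_i) is
independent; only the XOR with the previous ciphertext block is serial, and
CBC encryption has no such split, which is why it stays on the single-block
path above. A small C sketch of that structure with a generic block-decrypt
callback (illustrative; assumes n > 0 and non-overlapping buffers):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
                               const void *key);

    static void cbc_decrypt_sketch(const uint8_t *in, uint8_t *out, size_t n,
                                   const void *key, uint8_t iv[16],
                                   block128_f decrypt)
    {
        for (size_t i = 0; i < n; i++) {
            uint8_t plain[16];
            decrypt(in + 16 * i, plain, key);         /* parallelizable part */
            const uint8_t *prev = (i == 0) ? iv : in + 16 * (i - 1);
            for (int b = 0; b < 16; b++)
                out[16 * i + b] = plain[b] ^ prev[b]; /* chaining part */
        }
        memcpy(iv, in + 16 * (n - 1), 16);            /* save back IV */
    }
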
+
+{{{
+my ($ivp)=("x4");
+my ($ctr)=("w5");
+my $ivec=("v3");
+
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {$ivec.4s},[$ivp]
+___
+ &rev32($ivec,$ivec);
+ &load_sbox();
+$code.=<<___;
+ cmp $blocks,#1
+ b.ne 1f
+ // fast processing for one single block without
+ // context saving overhead
+___
+ &encrypt_1blk($ivec);
+$code.=<<___;
+ ld1 {@data[0].16b},[$inp]
+ eor @data[0].16b,@data[0].16b,$ivec.16b
+ st1 {@data[0].16b},[$outp]
+ ret
+1:
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ mov $word0,$ivec.s[0]
+ mov $word1,$ivec.s[1]
+ mov $word2,$ivec.s[2]
+ mov $ctr,$ivec.s[3]
+.Lctr32_4_blocks_process:
+ cmp $blocks,#4
+ b.lt 1f
+ dup @data[0].4s,$word0
+ dup @data[1].4s,$word1
+ dup @data[2].4s,$word2
+ mov @data[3].s[0],$ctr
+ add $ctr,$ctr,#1
+ mov $data[3].s[1],$ctr
+ add $ctr,$ctr,#1
+ mov @data[3].s[2],$ctr
+ add $ctr,$ctr,#1
+ mov @data[3].s[3],$ctr
+ add $ctr,$ctr,#1
+ cmp $blocks,#8
+ b.ge .Lctr32_8_blocks_process
+ bl _vpsm4_enc_4blks
+ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ subs $blocks,$blocks,#4
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+.Lctr32_8_blocks_process:
+ dup @datax[0].4s,$word0
+ dup @datax[1].4s,$word1
+ dup @datax[2].4s,$word2
+ mov @datax[3].s[0],$ctr
+ add $ctr,$ctr,#1
+ mov $datax[3].s[1],$ctr
+ add $ctr,$ctr,#1
+ mov @datax[3].s[2],$ctr
+ add $ctr,$ctr,#1
+ mov @datax[3].s[3],$ctr
+ add $ctr,$ctr,#1
+ bl _vpsm4_enc_8blks
+ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
+ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ eor @data[0].16b,@data[0].16b,@datax[0].16b
+ eor @data[1].16b,@data[1].16b,@datax[1].16b
+ eor @data[2].16b,@data[2].16b,@datax[2].16b
+ eor @data[3].16b,@data[3].16b,@datax[3].16b
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+1: // last block processing
+ subs $blocks,$blocks,#1
+ b.lt 100f
+ b.gt 1f
+ mov $ivec.s[0],$word0
+ mov $ivec.s[1],$word1
+ mov $ivec.s[2],$word2
+ mov $ivec.s[3],$ctr
+___
+ &encrypt_1blk($ivec);
+$code.=<<___;
+ ld1 {@data[0].16b},[$inp]
+ eor @data[0].16b,@data[0].16b,$ivec.16b
+ st1 {@data[0].16b},[$outp]
+ b 100f
+1: // last 2 blocks processing
+ dup @data[0].4s,$word0
+ dup @data[1].4s,$word1
+ dup @data[2].4s,$word2
+ mov @data[3].s[0],$ctr
+ add $ctr,$ctr,#1
+ mov @data[3].s[1],$ctr
+ subs $blocks,$blocks,#1
+ b.ne 1f
+ bl _vpsm4_enc_4blks
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
+ b 100f
+1: // last 3 blocks processing
+ add $ctr,$ctr,#1
+ mov @data[3].s[2],$ctr
+ bl _vpsm4_enc_4blks
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
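
${prefix}_ctr32_encrypt_blocks keeps the first three IV words fixed in scalar
registers and increments only the last big-endian 32-bit word, matching
OpenSSL's ctr128_f convention. A scalar C sketch of that counter handling
(one block per cipher call; the assembly instead materialises 4 or 8 counter
blocks and encrypts them together):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
                               const void *key);

    static void ctr32_encrypt_sketch(const uint8_t *in, uint8_t *out, size_t n,
                                     const void *key, const uint8_t ivec[16],
                                     block128_f encrypt)
    {
        uint8_t ctrblk[16], stream[16];
        memcpy(ctrblk, ivec, 16);
        uint32_t ctr = ((uint32_t)ivec[12] << 24) | ((uint32_t)ivec[13] << 16) |
                       ((uint32_t)ivec[14] << 8)  |  (uint32_t)ivec[15];

        for (size_t i = 0; i < n; i++) {
            encrypt(ctrblk, stream, key);              /* keystream block */
            for (int b = 0; b < 16; b++)
                out[16 * i + b] = in[16 * i + b] ^ stream[b];
            ctr++;                                     /* 32-bit wrap intended */
            ctrblk[12] = (uint8_t)(ctr >> 24);
            ctrblk[13] = (uint8_t)(ctr >> 16);
            ctrblk[14] = (uint8_t)(ctr >> 8);
            ctrblk[15] = (uint8_t)ctr;
        }
    }
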
+########################################
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+ print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
index e27aa49e67..75a215ab80 100644
--- a/crypto/sm4/build.info
+++ b/crypto/sm4/build.info
@@ -1,8 +1,8 @@
LIBS=../../libcrypto
IF[{- !$disabled{asm} -}]
- $SM4DEF_aarch64=SM4_ASM
- $SM4ASM_aarch64=sm4-armv8.S
+ $SM4DEF_aarch64=SM4_ASM VPSM4_ASM
+ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S
# Now that we have defined all the arch specific variables, use the
# appropriate one, and define the appropriate macros
@@ -29,4 +29,6 @@ IF[{- !$disabled{module} && !$disabled{shared} -}]
ENDIF
GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
+GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl
INCLUDE[sm4-armv8.o]=..
+INCLUDE[vpsm4-armv8.o]=..
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
index 42c8b44a43..11f9b9d88b 100644
--- a/include/crypto/sm4_platform.h
+++ b/include/crypto/sm4_platform.h
@@ -15,6 +15,16 @@
# if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
# include "arm_arch.h"
# if __ARM_MAX_ARCH__>=8
+extern unsigned int OPENSSL_arm_midr;
+static inline int vpsm4_capable(void)
+{
+ return (OPENSSL_armcap_P & ARMV8_CPUID) &&
+ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) ||
+ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1));
+}
+# if defined(VPSM4_ASM)
+# define VPSM4_CAPABLE vpsm4_capable()
+# endif
# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
@@ -45,4 +55,23 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
const unsigned char ivec[16]);
# endif /* HWSM4_CAPABLE */
+#ifdef VPSM4_CAPABLE
+int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
+int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
+void vpsm4_encrypt(const unsigned char *in, unsigned char *out,
+ const SM4_KEY *key);
+void vpsm4_decrypt(const unsigned char *in, unsigned char *out,
+ const SM4_KEY *key);
+void vpsm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const SM4_KEY *key,
+ unsigned char *ivec, const int enc);
+void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const SM4_KEY *key,
+ const int enc);
+void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ const unsigned char ivec[16]);
+# endif /* VPSM4_CAPABLE */
+
+
#endif /* OSSL_SM4_PLATFORM_H */
diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
index b9633f83ed..db7fe0fe2f 100644
--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
@@ -32,6 +32,13 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
# endif
} else
# endif /* HWSM4_CAPABLE */
+# ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_encrypt_key(key, ks);
+ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) vpsm4_encrypt);
+ ctx->ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks;
+ } else
+# endif /* VPSM4_CAPABLE */
{
ossl_sm4_set_key(key, ks);
CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
index 4cd3d3d669..9a2e99f67c 100644
--- a/providers/implementations/ciphers/cipher_sm4_hw.c
+++ b/providers/implementations/ciphers/cipher_sm4_hw.c
@@ -41,6 +41,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
#endif
(void)0; /* terminate potentially open 'else' */
} else
+#endif
+#ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_encrypt_key(key, ks);
+ ctx->block = (block128_f)vpsm4_encrypt;
+ ctx->stream.cbc = NULL;
+ if (ctx->mode == EVP_CIPH_CBC_MODE)
+ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
+ else if (ctx->mode == EVP_CIPH_ECB_MODE)
+ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
+ else if (ctx->mode == EVP_CIPH_CTR_MODE)
+ ctx->stream.ctr = (ctr128_f)vpsm4_ctr32_encrypt_blocks;
+ } else
#endif
{
ossl_sm4_set_key(key, ks);
@@ -61,6 +74,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
#endif
} else
+#endif
+#ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_decrypt_key(key, ks);
+ ctx->block = (block128_f)vpsm4_decrypt;
+ ctx->stream.cbc = NULL;
+ if (ctx->mode == EVP_CIPH_CBC_MODE)
+ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
+ else if (ctx->mode == EVP_CIPH_ECB_MODE)
+ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
+ } else
#endif
{
ossl_sm4_set_key(key, ks);
--
2.37.3.windows.1
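
From an application's point of view nothing changes: the vpsm4 routines sit
behind the existing EVP SM4 ciphers and are selected at run time by
VPSM4_CAPABLE. A usage sketch against the public EVP API (error handling
trimmed; CPUs without the N1/V1 MIDR match simply take the generic path):

    #include <openssl/evp.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned char key[16] = {0}, iv[16] = {0};
        unsigned char msg[64] =
            "sixty-four bytes of input so the 4-block path can kick in....";
        unsigned char ct[64 + 16];
        int outl = 0, tmplen = 0;

        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
        if (ctx == NULL)
            return 1;

        /* SM4-CTR: a parallelisable mode, so >= 4 blocks of input lets the
         * vpsm4 ASIMD path (when VPSM4_CAPABLE) do the work. */
        if (EVP_EncryptInit_ex(ctx, EVP_sm4_ctr(), NULL, key, iv) != 1 ||
            EVP_EncryptUpdate(ctx, ct, &outl, msg, sizeof(msg)) != 1 ||
            EVP_EncryptFinal_ex(ctx, ct + outl, &tmplen) != 1) {
            EVP_CIPHER_CTX_free(ctx);
            return 1;
        }
        printf("encrypted %d bytes\n", outl + tmplen);
        EVP_CIPHER_CTX_free(ctx);
        return 0;
    }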