1 Star 0 Fork 96

jinlun/openssl

forked from src-openEuler/openssl 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
Backport-SM4-AESE-optimization-for-ARMv8.patch 66.15 KB
一键复制 编辑 原始数据 按行查看 历史
xuyizhou 提交于 2023-03-16 09:45 . backport SM3/SM4 optimization
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322
From 730387aebda57a1bb0af5a74747d4dadc5e033f7 Mon Sep 17 00:00:00 2001
From: Xu Yizhou <xuyizhou1@huawei.com>
Date: Wed, 18 Jan 2023 09:55:02 +0800
Subject: [PATCH 12/13] SM4 AESE optimization for ARMv8
Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/19914)
---
crypto/sm4/asm/vpsm4-armv8.pl | 458 +++++
crypto/sm4/asm/vpsm4_ex-armv8.pl | 1544 +++++++++++++++++
crypto/sm4/build.info | 4 +-
include/crypto/sm4_platform.h | 41 +-
.../implementations/ciphers/cipher_sm4_hw.c | 26 +-
.../implementations/ciphers/cipher_sm4_xts.c | 4 +-
.../implementations/ciphers/cipher_sm4_xts.h | 2 +-
.../ciphers/cipher_sm4_xts_hw.c | 33 +-
8 files changed, 2090 insertions(+), 22 deletions(-)
create mode 100644 crypto/sm4/asm/vpsm4_ex-armv8.pl
diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
index 73797af582..e19de30901 100755
--- a/crypto/sm4/asm/vpsm4-armv8.pl
+++ b/crypto/sm4/asm/vpsm4-armv8.pl
@@ -28,6 +28,7 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\""
$prefix="vpsm4";
my @vtmp=map("v$_",(0..3));
+my @qtmp=map("q$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
@@ -36,6 +37,7 @@ my @vtmpx=map("v$_",(12..15));
my @sbox=map("v$_",(16..31));
my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
+my ($xtmp1,$xtmp2)=("x8","x9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
@@ -60,6 +62,51 @@ ___
}
}
+# Emit a byte swap of every 32-bit lane that is active only when the
+# generated assembly is built with __AARCH64EB__ defined.
+#   $dst - destination vector register name (e.g. "v4")
+#   $src - optional source register; when absent or equal to $dst the
+#          operation is done in place and other builds get no code at all.
+sub rev32_armeb() {
+    my ($dst, $src) = @_;
+
+    if (!$src or ("$src" eq "$dst")) {
+$code.=<<___;
+#ifdef __AARCH64EB__
+ rev32 $dst.16b,$dst.16b
+#endif
+___
+    } else {
+$code.=<<___;
+#ifdef __AARCH64EB__
+ rev32 $dst.16b,$src.16b
+#else
+ mov $dst.16b,$src.16b
+#endif
+___
+    }
+}
+
+# Emit a per-byte bit reversal of a vector register for the "_gb" flavour.
+#   $dst - destination vector register name
+#   $src - optional source register; absent or equal to $dst means in place
+#   $std - flavour suffix; only "_gb" emits an rbit. For any other value a
+#          distinct $src is merely copied and the in-place form emits nothing.
+sub rbit() {
+ my $dst = shift;
+ my $src = shift;
+ my $std = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+ if ($std eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ } else {
+$code.=<<___;
+ mov $dst.16b,$src.16b
+___
+ }
+ } else {
+ # In-place form. This branch is also taken when $src was not supplied,
+ # so both operands must name $dst (as rev32_armeb does); interpolating
+ # an empty $src would emit the malformed operand ".16b".
+ if ($std eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$dst.16b
+___
+ }
+ }
+}
+
sub transpose() {
my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
@@ -435,6 +482,58 @@ $code.=<<___;
___
}
+
+# Emit code that packs two 64-bit general registers into one vector
+# register: the first argument lands in d[0], the second in d[1]. The
+# result then goes through rev32_armeb() in place.
+sub mov_reg_to_vec() {
+    my ($lo, $hi, $vec) = @_;
+
+$code.=<<___;
+ mov $vec.d[0],$lo
+ mov $vec.d[1],$hi
+___
+    &rev32_armeb($vec,$vec);
+}
+
+# Emit code that unpacks a vector register into two 64-bit general
+# registers: d[0] goes to the second argument, d[1] to the third.
+# Unlike mov_reg_to_vec() no endian fix-up is emitted here.
+sub mov_vec_to_reg() {
+    my ($vec, $lo, $hi) = @_;
+
+$code.=<<___;
+ mov $lo,$vec.d[0]
+ mov $hi,$vec.d[1]
+___
+}
+
+# Emit the scalar update of a 128-bit tweak held in two 64-bit registers:
+# ($des1:$des0) = ($src1:$src0) shifted left by one bit, with 0x87 XORed
+# into the low half when the top bit of $src1 (the bit shifted out) is set.
+#   $src0/$src1 - d[0]/d[1] halves of the input tweak
+#   $des0/$des1 - d[0]/d[1] halves of the result
+# The sequence relies on register aliasing set up in the declarations
+# ($wtmp1 is w8 and $xtmp1 is x8; $wtmp2 is w9 and $xtmp2 is x9): the extr
+# result written to x9 is read back as w9, and the mask written to w8 is
+# read back as x8. Statement order therefore matters.
+# Clobbers $wtmp0 (w7), x8 and x9.
+sub compute_tweak() {
+ my $src0 = shift;
+ my $src1 = shift;
+ my $des0 = shift;
+ my $des1 = shift;
+$code.=<<___;
+ mov $wtmp0,0x87
+ extr $xtmp2,$src1,$src1,#32
+ extr $des1,$src1,$src0,#63
+ and $wtmp1,$wtmp0,$wtmp2,asr#31
+ eor $des0,$xtmp1,$src0,lsl#1
+___
+}
+
+# Vector counterpart of compute_tweak(): emit code deriving a tweak in $des
+# from the tweak in $src using NEON registers only.
+#   $src - input tweak vector register
+#   $des - output tweak vector register
+#   $std - flavour suffix handed to rbit(); "_gb" bit-reverses every byte
+#          on the way in and on the way out, other values do not.
+# Every byte is shifted left by one. The bit that fell out of each byte is
+# recovered by rotating the vector with ext #15 and shifting right by 7,
+# multiplied by the per-byte constant (0x87 in the least significant byte
+# of the literal, 0x01 elsewhere) and XORed back in.
+# The constant is loaded through @qtmp[0] (q0) and used as @vtmp[0] (v0),
+# which are the same register. Clobbers @vtmp[0], @vtmp[1] and @vtmp[2].
+# NOTE(review): "ldr qN, =<128-bit literal>" needs assembler support for
+# quadword literal pools - confirm every targeted toolchain accepts it.
+sub compute_tweak_vec() {
+ my $src = shift;
+ my $des = shift;
+ my $std = shift;
+ &rbit(@vtmp[2],$src,$std);
+$code.=<<___;
+ ldr @qtmp[0], =0x01010101010101010101010101010187
+ shl $des.16b, @vtmp[2].16b, #1
+ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
+ ushr @vtmp[1].16b, @vtmp[1].16b, #7
+ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
+ eor $des.16b, $des.16b, @vtmp[1].16b
+___
+ &rbit($des,$des,$std);
+}
+
$code=<<___;
#include "arm_arch.h"
.arch armv8-a
@@ -1101,6 +1200,365 @@ $code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
+
+{{{
+# Register map for the XTS routines. Several names share a register on
+# purpose:
+#   - $blocks and $len are both x2 (the byte length is converted in place);
+#   - @twx (x12..x27) holds eight 128-bit tweaks as register pairs, and its
+#     last pair (x26,x27) doubles as $rks1/$rks2, which are only needed
+#     until $rks has been reloaded from them;
+#   - $lastBlk reuses x26 in the ciphertext-stealing tail.
+my ($blocks,$len)=("x2","x2");
+my $ivp=("x5");
+my @twx=map("x$_",(12..27));
+my ($rks1,$rks2)=("x26","x27");
+my $lastBlk=("x26");
+my $enc=("w28");
+my $remain=("x29");
+
+my @tweak=@datax;
+
+# Emit ${prefix}_xts_encrypt${std}.
+#   $std - "" or "_gb"; selects the label suffix and whether tweaks are
+#          bit-reversed per byte (see rbit()) around the tweak arithmetic.
+# Register arguments read by the generated function: x0 input, x1 output,
+# x2 length in bytes, x3 round keys used for the data, x4 round keys used
+# to encrypt the initial tweak, x5 pointer to the tweak/IV (also used as
+# scratch storage for the last tweak), w6 direction flag (1 = encrypt).
+#
+# Invariant kept by this generator: tweaks live in x registers in
+# "arithmetic" form (what compute_tweak() operates on) and must pass through
+# rbit(...,$std) before being XORed with data or stored to [$ivp], because
+# .last_2blks_tweak reloads that stored value and feeds it to
+# compute_tweak_vec(), which applies rbit on entry.
+sub gen_xts_cipher() {
+ my $std = shift;
+$code.=<<___;
+.globl ${prefix}_xts_encrypt${std}
+.type ${prefix}_xts_encrypt${std},%function
+.align 5
+${prefix}_xts_encrypt${std}:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov $rks1,x3
+ mov $rks2,x4
+ mov $enc,w6
+ ld1 {@tweak[0].4s}, [$ivp]
+ mov $rks,$rks2
+___
+ # Encrypt the IV with the second key schedule to obtain the first tweak.
+ &load_sbox();
+ &rev32(@tweak[0],@tweak[0]);
+ &encrypt_1blk(@tweak[0]);
+$code.=<<___;
+ mov $rks,$rks1
+ and $remain,$len,#0x0F
+ // convert length into blocks
+ lsr $blocks,$len,4
+ cmp $blocks,#1
+ b.lt .return${std}
+
+ cmp $remain,0
+ // If the encryption/decryption Length is N times of 16,
+ // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
+ b.eq .xts_encrypt_blocks${std}
+
+ // If the encryption/decryption length is not N times of 16,
+ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
+ // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
+ subs $blocks,$blocks,#1
+ b.eq .only_2blks_tweak${std}
+.xts_encrypt_blocks${std}:
+___
+ # Move the first tweak into x registers and pre-compute the next seven.
+ # From here on x26/x27 no longer hold the key schedule pointers.
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rev32_armeb(@tweak[0],@tweak[0]);
+ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
+$code.=<<___;
+.Lxts_8_blocks_process${std}:
+ cmp $blocks,#8
+ b.lt .Lxts_4_blocks_process${std}
+___
+ &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
+ &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
+ &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
+ &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
+ &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
+ &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
+ &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
+ &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
+$code.=<<___;
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &rbit(@vtmp[0],@vtmp[0],$std);
+ &rbit(@vtmp[1],@vtmp[1],$std);
+ &rbit(@vtmp[2],@vtmp[2],$std);
+ &rbit(@vtmp[3],@vtmp[3],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @vtmp[0].16b
+ eor @data[1].16b, @data[1].16b, @vtmp[1].16b
+ eor @data[2].16b, @data[2].16b, @vtmp[2].16b
+ eor @data[3].16b, @data[3].16b, @vtmp[3].16b
+ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+___
+ &rbit(@vtmpx[0],@vtmpx[0],$std);
+ &rbit(@vtmpx[1],@vtmpx[1],$std);
+ &rbit(@vtmpx[2],@vtmpx[2],$std);
+ &rbit(@vtmpx[3],@vtmpx[3],$std);
+$code.=<<___;
+ eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
+ eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
+ eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
+ eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],@datax[3]);
+ &transpose(@data,@vtmp);
+ &transpose(@datax,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_8blks
+___
+ &transpose(@vtmp,@datax);
+ &transpose(@data,@datax);
+
+ # The tweak vectors were overwritten above (the transposes use @vtmp as
+ # scratch and @vtmpx shares v12..v15 with the round-key registers), so
+ # rebuild them from the x registers while advancing each pair by eight
+ # positions for the next iteration.
+ &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
+ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
+ &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
+ &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
+ &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
+ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
+ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
+ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
+ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
+ # Fix: the rebuilt tweaks must get the same rbit treatment as the ones
+ # XORed in before encryption, otherwise the "_gb" flavour whitens the
+ # output with a different value and stores a mis-formatted last tweak.
+ # For the other flavour these calls emit nothing.
+ &rbit(@vtmpx[0],@vtmpx[0],$std);
+ &rbit(@vtmpx[1],@vtmpx[1],$std);
+ &rbit(@vtmpx[2],@vtmpx[2],$std);
+ &rbit(@vtmpx[3],@vtmpx[3],$std);
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+ &rbit(@tweak[2],@tweak[2],$std);
+ &rbit(@tweak[3],@tweak[3],$std);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
+ eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
+ eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+ eor @data[3].16b, @data[3].16b, @tweak[3].16b
+
+ // save the last tweak
+ st1 {@tweak[3].4s},[$ivp]
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.gt .Lxts_8_blocks_process${std}
+ b 100f
+.Lxts_4_blocks_process${std}:
+___
+ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
+ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
+ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
+ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
+$code.=<<___;
+ cmp $blocks,#4
+ b.lt 1f
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ # @tweak is not touched by the 4-block encryption, so the rbit-ed
+ # values are reused for the XOR after encryption.
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+ &rbit(@tweak[2],@tweak[2],$std);
+ &rbit(@tweak[3],@tweak[3],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+ eor @data[3].16b, @data[3].16b, @tweak[3].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &transpose(@data,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+___
+ &transpose(@vtmp,@data);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
+ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ sub $blocks,$blocks,#4
+___
+ # Tweaks for up to three trailing blocks; @tweak[3] still holds the
+ # tweak of the fourth block just written and is what gets saved below.
+ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
+ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
+ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
+$code.=<<___;
+ // save the last tweak
+ st1 {@tweak[3].4s},[$ivp]
+1:
+ // process last block
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@data[0].4s},[$inp],#16
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ st1 {@data[0].4s},[$outp],#16
+ // save the last tweak
+ st1 {@tweak[0].4s},[$ivp]
+ b 100f
+1: // process last 2 blocks
+ cmp $blocks,#2
+ b.gt 1f
+ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &transpose(@data,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+___
+ &transpose(@vtmp,@data);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
+ // save the last tweak
+ st1 {@tweak[1].4s},[$ivp]
+ b 100f
+1: // process last 3 blocks
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+ &rbit(@tweak[2],@tweak[2],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &transpose(@data,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+___
+ &transpose(@vtmp,@data);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
+ // save the last tweak
+ st1 {@tweak[2].4s},[$ivp]
+100:
+ cmp $remain,0
+ b.eq .return${std}
+
+// This brance calculates the last two tweaks,
+// while the encryption/decryption length is larger than 32
+.last_2blks_tweak${std}:
+ ld1 {@tweak[0].4s},[$ivp]
+___
+ &rev32_armeb(@tweak[0],@tweak[0]);
+ &compute_tweak_vec(@tweak[0],@tweak[1],$std);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$std);
+$code.=<<___;
+ b .check_dec${std}
+
+
+// This brance calculates the last two tweaks,
+// while the encryption/decryption length is equal to 32, who only need two tweaks
+.only_2blks_tweak${std}:
+ mov @tweak[1].16b,@tweak[0].16b
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ # Fix: pass $std so the "_gb" flavour derives this tweak with the same
+ # bit ordering as every other path (the argument was missing, which made
+ # compute_tweak_vec silently fall back to the non-"_gb" arithmetic).
+ &compute_tweak_vec(@tweak[1],@tweak[2],$std);
+$code.=<<___;
+ b .check_dec${std}
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec${std}:
+ // encryption:1 decryption:0
+ cmp $enc,1
+ b.eq .prcess_last_2blks${std}
+ mov @vtmp[0].16B,@tweak[1].16b
+ mov @tweak[1].16B,@tweak[2].16b
+ mov @tweak[2].16B,@vtmp[0].16b
+
+.prcess_last_2blks${std}:
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &rev32_armeb(@tweak[2],@tweak[2]);
+$code.=<<___;
+ ld1 {@data[0].4s},[$inp],#16
+ eor @data[0].16b, @data[0].16b, @tweak[1].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[1].16b
+ st1 {@data[0].4s},[$outp],#16
+
+ sub $lastBlk,$outp,16
+ .loop${std}:
+ subs $remain,$remain,1
+ ldrb $wtmp0,[$lastBlk,$remain]
+ ldrb $wtmp1,[$inp,$remain]
+ strb $wtmp1,[$lastBlk,$remain]
+ strb $wtmp0,[$outp,$remain]
+ b.gt .loop${std}
+ ld1 {@data[0].4s}, [$lastBlk]
+ eor @data[0].16b, @data[0].16b, @tweak[2].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[2].16b
+ st1 {@data[0].4s}, [$lastBlk]
+.return${std}:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
+___
+} # end of gen_xts_cipher
+&gen_xts_cipher("_gb");
+&gen_xts_cipher("");
+}}}
########################################
open SELF,$0;
while(<SELF>) {
diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl
new file mode 100644
index 0000000000..3d094aa535
--- /dev/null
+++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl
@@ -0,0 +1,1544 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# This module implements SM4 with ASIMD and AESE on AARCH64
+#
+# Dec 2022
+#
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+# Locate arm-xlate.pl: first next to this script, then under
+# ../../perlasm/ relative to it.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+# Everything written to STDOUT from here on is piped through arm-xlate.pl.
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Symbol prefix of the generated functions and labels.
+$prefix="vpsm4_ex";
+# Register allocation. Several names intentionally refer to the same
+# physical register:
+#   - @qtmp (q0..q3) are the quadword views of @vtmp (v0..v3);
+#   - @vtmpx (v12..v15) is the same set as $rk0,$rk1,$rka,$rkb;
+#   - the *MatQ/$MaskQ/$ANDMaskQ names (q26..q31) are the quadword views of
+#     the matching *V names (v26..v31);
+#   - $tmpw/$tmp are w6/x6, $wtmp1/$xtmp1 are w8/x8, $wtmp2/$xtmp2 are w9/x9.
+my @vtmp=map("v$_",(0..3));
+my @qtmp=map("q$_",(0..3));
+my @data=map("v$_",(4..7));
+my @datax=map("v$_",(8..11));
+my ($rk0,$rk1)=("v12","v13");
+my ($rka,$rkb)=("v14","v15");
+my @vtmpx=map("v$_",(12..15));
+my ($vtmp4,$vtmp5)=("v24","v25");
+my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
+my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");
+
+my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
+my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
+my ($xtmp1,$xtmp2)=("x8","x9");
+my ($ptr,$counter)=("x10","w11");
+my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
+
+# Emit a byte swap of every 32-bit lane that is active only when the
+# generated assembly is built WITHOUT __AARCH64EB__ defined.
+#   $dst - destination vector register name
+#   $src - optional source register; when absent or equal to $dst the swap
+#          is done in place and __AARCH64EB__ builds get no code at all.
+sub rev32() {
+    my ($dst, $src) = @_;
+
+    if (!$src or ("$src" eq "$dst")) {
+$code.=<<___;
+#ifndef __AARCH64EB__
+ rev32 $dst.16b,$dst.16b
+#endif
+___
+    } else {
+$code.=<<___;
+#ifndef __AARCH64EB__
+ rev32 $dst.16b,$src.16b
+#else
+ mov $dst.16b,$src.16b
+#endif
+___
+    }
+}
+
+# Mirror image of rev32(): the emitted byte swap of every 32-bit lane is
+# active only when __AARCH64EB__ IS defined.
+#   $dst - destination vector register name
+#   $src - optional source register; when absent or equal to $dst the swap
+#          is done in place and other builds get no code at all.
+sub rev32_armeb() {
+    my ($dst, $src) = @_;
+
+    if (!$src or ("$src" eq "$dst")) {
+$code.=<<___;
+#ifdef __AARCH64EB__
+ rev32 $dst.16b,$dst.16b
+#endif
+___
+    } else {
+$code.=<<___;
+#ifdef __AARCH64EB__
+ rev32 $dst.16b,$src.16b
+#else
+ mov $dst.16b,$src.16b
+#endif
+___
+    }
+}
+
+# Emit a per-byte bit reversal of a vector register for the "_gb" flavour.
+#   $dst - destination vector register name
+#   $src - optional source register; absent or equal to $dst means in place
+#   $std - flavour suffix; only "_gb" emits an rbit. For any other value a
+#          distinct $src is merely copied and the in-place form emits nothing.
+sub rbit() {
+ my $dst = shift;
+ my $src = shift;
+ my $std = shift;
+
+ if ($src and ("$src" ne "$dst")) {
+ if ($std eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$src.16b
+___
+ } else {
+$code.=<<___;
+ mov $dst.16b,$src.16b
+___
+ }
+ } else {
+ # In-place form. This branch is also taken when $src was not supplied,
+ # so both operands must name $dst (as rev32_armeb does); interpolating
+ # an empty $src would emit the malformed operand ".16b".
+ if ($std eq "_gb") {
+$code.=<<___;
+ rbit $dst.16b,$dst.16b
+___
+ }
+ }
+}
+
+# Emit a 4x4 transpose of 32-bit words across four vector registers.
+# Arguments: four data registers (transposed in place) followed by four
+# scratch registers, which are overwritten.
+sub transpose() {
+    my @d = @_[0..3];
+    my @t = @_[4..7];
+
+$code.=<<___;
+ zip1 $t[0].4s,$d[0].4s,$d[1].4s
+ zip2 $t[1].4s,$d[0].4s,$d[1].4s
+ zip1 $t[2].4s,$d[2].4s,$d[3].4s
+ zip2 $t[3].4s,$d[2].4s,$d[3].4s
+ zip1 $d[0].2d,$t[0].2d,$t[2].2d
+ zip2 $d[1].2d,$t[0].2d,$t[2].2d
+ zip1 $d[2].2d,$t[1].2d,$t[3].2d
+ zip2 $d[3].2d,$t[1].2d,$t[3].2d
+___
+}
+
+# Matrix multiplication realised with two 16-entry table lookups:
+#   Mat*x = (lowerMat*x) ^ (higherMat*x)
+# The low nibble of every byte of $x indexes $lowerMat, the high nibble
+# indexes $higherMat, and the two results are XORed into $x.
+# The fourth argument is a scratch vector register that gets overwritten.
+# $ANDMaskV is used as the low-nibble mask.
+sub mul_matrix() {
+    my ($x, $higherMat, $lowerMat, $scratch) = @_;
+
+$code.=<<___;
+ ushr $scratch.16b, $x.16b, 4
+ and $x.16b, $x.16b, $ANDMaskV.16b
+ tbl $x.16b, {$lowerMat.16b}, $x.16b
+ tbl $scratch.16b, {$higherMat.16b}, $scratch.16b
+ eor $x.16b, $x.16b, $scratch.16b
+___
+}
+
+# sbox operation for 4-lane of words
+# Emits, for the vector register $dat (updated in place):
+#   1. a byte permutation through $MaskV, a mul_matrix() with
+#      $TAHMatV/$TALMatV, an AESE with an all-zero key operand, and a
+#      mul_matrix() with $ATAHMatV/$ATALMatV;
+#   2. the linear step x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) on
+#      each 32-bit lane (each ushr/sli pair forms one rotate-left).
+# NOTE(review): the table contents are loaded elsewhere; presumably the
+# permutation compensates for the row shift AESE performs and the two
+# matrices map between the SM4 and AES S-box representations - confirm.
+# Clobbers @vtmp[0..3] and $vtmp4.
+sub sbox() {
+ my $dat = shift;
+
+$code.=<<___;
+ // optimize sbox using AESE instruction
+ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
+___
+ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
+$code.=<<___;
+ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
+ aese @vtmp[0].16b,@vtmp[1].16b
+___
+ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
+$code.=<<___;
+ mov $dat.16b,@vtmp[0].16b
+
+ // linear transformation
+ ushr @vtmp[0].4s,$dat.4s,32-2
+ ushr @vtmp[1].4s,$dat.4s,32-10
+ ushr @vtmp[2].4s,$dat.4s,32-18
+ ushr @vtmp[3].4s,$dat.4s,32-24
+ sli @vtmp[0].4s,$dat.4s,2
+ sli @vtmp[1].4s,$dat.4s,10
+ sli @vtmp[2].4s,$dat.4s,18
+ sli @vtmp[3].4s,$dat.4s,24
+ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
+ eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b
+ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
+ eor $dat.16b,$dat.16b,$vtmp4.16b
+___
+}
+
+# sbox operation for 8-lane of words
+# Same transformation as sbox(), applied to two vector registers ($dat and
+# $datx, both updated in place) with the instruction streams interleaved.
+# $vtmp5 serves first as the all-zero AESE key operand and afterwards as
+# the rol(x,2) temporary for $datx.
+# Clobbers @vtmp[0..3], $vtmp4 and $vtmp5.
+sub sbox_double() {
+ my $dat = shift;
+ my $datx = shift;
+
+$code.=<<___;
+ // optimize sbox using AESE instruction
+ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
+ tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
+___
+ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
+ &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
+$code.=<<___;
+ eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
+ aese @vtmp[0].16b,$vtmp5.16b
+ aese @vtmp[1].16b,$vtmp5.16b
+___
+ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
+ &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
+$code.=<<___;
+ mov $dat.16b,@vtmp[0].16b
+ mov $datx.16b,@vtmp[1].16b
+
+ // linear transformation
+ ushr @vtmp[0].4s,$dat.4s,32-2
+ ushr $vtmp5.4s,$datx.4s,32-2
+ ushr @vtmp[1].4s,$dat.4s,32-10
+ ushr @vtmp[2].4s,$dat.4s,32-18
+ ushr @vtmp[3].4s,$dat.4s,32-24
+ sli @vtmp[0].4s,$dat.4s,2
+ sli $vtmp5.4s,$datx.4s,2
+ sli @vtmp[1].4s,$dat.4s,10
+ sli @vtmp[2].4s,$dat.4s,18
+ sli @vtmp[3].4s,$dat.4s,24
+ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
+ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
+ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
+ eor $dat.16b,$dat.16b,$vtmp4.16b
+ ushr @vtmp[1].4s,$datx.4s,32-10
+ ushr @vtmp[2].4s,$datx.4s,32-18
+ ushr @vtmp[3].4s,$datx.4s,32-24
+ sli @vtmp[1].4s,$datx.4s,10
+ sli @vtmp[2].4s,$datx.4s,18
+ sli @vtmp[3].4s,$datx.4s,24
+ eor $vtmp4.16b,$vtmp5.16b,$datx.16b
+ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
+ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
+ eor $datx.16b,$datx.16b,$vtmp4.16b
+___
+}
+
+# sbox operation for one single word
+# $word names a 32-bit general register; it is copied into lane 0 of
+# @vtmp[3], run through the same table/AESE sequence as sbox(), and the
+# linear step is then done with scalar rotates (ror #32-n equals a
+# rotate-left by n). The result replaces $word.
+# Only lane 0 of @vtmp[3] is written before the lookup, so the other lanes
+# carry whatever the register held; only lane 0 of the result is read back.
+# Clobbers @vtmp[0..3] and $wtmp0 - callers must not keep a live value in
+# $wtmp0 across this call.
+sub sbox_1word () {
+ my $word = shift;
+
+$code.=<<___;
+ mov @vtmp[3].s[0],$word
+ // optimize sbox using AESE instruction
+ tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
+___
+ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
+$code.=<<___;
+ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
+ aese @vtmp[0].16b,@vtmp[1].16b
+___
+ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
+$code.=<<___;
+
+ mov $wtmp0,@vtmp[0].s[0]
+ eor $word,$wtmp0,$wtmp0,ror #32-2
+ eor $word,$word,$wtmp0,ror #32-10
+ eor $word,$word,$wtmp0,ror #32-18
+ eor $word,$word,$wtmp0,ror #32-24
+___
+}
+
+# sm4 for one block of data, in scalar registers word0/word1/word2/word3
+# Emits four consecutive rounds. $kptr names the register pointing at the
+# round keys; it is post-incremented by 16 bytes in total (two ldp of two
+# 32-bit keys each).
+# Each pair of keys is loaded into $wtmp0/$wtmp1. The key in $wtmp0 is
+# always consumed before the next sbox_1word() call, which overwrites
+# $wtmp0; the key in $wtmp1 survives that call and is used afterwards.
+# Clobbers $tmpw, $wtmp0, $wtmp1, $wtmp2 and @vtmp[0..3].
+sub sm4_1blk () {
+ my $kptr = shift;
+
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor $tmpw,$word2,$word3
+ eor $wtmp2,$wtmp0,$word1
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ eor $word0,$word0,$tmpw
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor $tmpw,$word2,$word3
+ eor $wtmp2,$word0,$wtmp1
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ eor $word1,$word1,$tmpw
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor $tmpw,$word0,$word1
+ eor $wtmp2,$wtmp0,$word3
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ eor $word2,$word2,$tmpw
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor $tmpw,$word0,$word1
+ eor $wtmp2,$word2,$wtmp1
+ eor $tmpw,$tmpw,$wtmp2
+___
+ &sbox_1word($tmpw);
+$code.=<<___;
+ eor $word3,$word3,$tmpw
+___
+}
+
+# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
+# Emits four consecutive rounds over @data[0..3]. $kptr names the register
+# pointing at the round keys; it is post-incremented by 16 bytes in total.
+# Each round key is broadcast to all four lanes with dup. $rka caches the
+# XOR of the two words shared between consecutive rounds so that the
+# second round of each pair needs only one extra XOR.
+# Clobbers $wtmp0, $wtmp1, $rk0, $rk1, $rka, plus everything sbox()
+# clobbers (@vtmp[0..3], $vtmp4).
+sub sm4_4blks () {
+ my $kptr = shift;
+
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ dup $rk0.4s,$wtmp0
+ dup $rk1.4s,$wtmp1
+
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ eor $rka.16b,@data[2].16b,@data[3].16b
+ eor $rk0.16b,@data[1].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,$rk0.16b
+___
+ &sbox($rk0);
+$code.=<<___;
+ eor @data[0].16b,@data[0].16b,$rk0.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ eor $rka.16b,$rka.16b,@data[0].16b
+ eor $rk1.16b,$rka.16b,$rk1.16b
+___
+ &sbox($rk1);
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ eor @data[1].16b,@data[1].16b,$rk1.16b
+
+ dup $rk0.4s,$wtmp0
+ dup $rk1.4s,$wtmp1
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ eor $rka.16b,@data[0].16b,@data[1].16b
+ eor $rk0.16b,@data[3].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,$rk0.16b
+___
+ &sbox($rk0);
+$code.=<<___;
+ eor @data[2].16b,@data[2].16b,$rk0.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ eor $rka.16b,$rka.16b,@data[2].16b
+ eor $rk1.16b,$rka.16b,$rk1.16b
+___
+ &sbox($rk1);
+$code.=<<___;
+ eor @data[3].16b,@data[3].16b,$rk1.16b
+___
+}
+
+# sm4 for 8 lanes of data, in neon registers
+# data0/data1/data2/data3 datax0/datax1/datax2/datax3
+# Emits four consecutive rounds over both register groups. $kptr names the
+# register pointing at the round keys; it is post-incremented by 16 bytes.
+# $rka/$rkb cache the XOR of the two words shared between consecutive
+# rounds for the @data and @datax groups respectively. The second key of
+# each pair stays in $wtmp1 across sbox_double() and is broadcast only
+# when needed, because $rk0/$rk1 are used as the S-box operands.
+# Clobbers $wtmp0, $wtmp1, $rk0, $rk1, $rka, $rkb, plus everything
+# sbox_double() clobbers (@vtmp[0..3], $vtmp4, $vtmp5).
+sub sm4_8blks () {
+ my $kptr = shift;
+
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
+ dup $rk0.4s,$wtmp0
+ eor $rka.16b,@data[2].16b,@data[3].16b
+ eor $rkb.16b,@datax[2].16b,@datax[3].16b
+ eor @vtmp[0].16b,@data[1].16b,$rk0.16b
+ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,@vtmp[0].16b
+ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ eor @data[0].16b,@data[0].16b,$rk0.16b
+ eor @datax[0].16b,@datax[0].16b,$rk1.16b
+
+ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
+ dup $rk1.4s,$wtmp1
+ eor $rka.16b,$rka.16b,@data[0].16b
+ eor $rkb.16b,$rkb.16b,@datax[0].16b
+ eor $rk0.16b,$rka.16b,$rk1.16b
+ eor $rk1.16b,$rkb.16b,$rk1.16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ ldp $wtmp0,$wtmp1,[$kptr],8
+ eor @data[1].16b,@data[1].16b,$rk0.16b
+ eor @datax[1].16b,@datax[1].16b,$rk1.16b
+
+ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
+ dup $rk0.4s,$wtmp0
+ eor $rka.16b,@data[0].16b,@data[1].16b
+ eor $rkb.16b,@datax[0].16b,@datax[1].16b
+ eor @vtmp[0].16b,@data[3].16b,$rk0.16b
+ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
+ eor $rk0.16b,$rka.16b,@vtmp[0].16b
+ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ eor @data[2].16b,@data[2].16b,$rk0.16b
+ eor @datax[2].16b,@datax[2].16b,$rk1.16b
+
+ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
+ dup $rk1.4s,$wtmp1
+ eor $rka.16b,$rka.16b,@data[2].16b
+ eor $rkb.16b,$rkb.16b,@datax[2].16b
+ eor $rk0.16b,$rka.16b,$rk1.16b
+ eor $rk1.16b,$rkb.16b,$rk1.16b
+___
+ &sbox_double($rk0,$rk1);
+$code.=<<___;
+ eor @data[3].16b,@data[3].16b,$rk0.16b
+ eor @datax[3].16b,@datax[3].16b,$rk1.16b
+___
+}
+
+sub encrypt_1blk_norev() { # emit an 8-iteration loop around sm4_1blk for one block; unlike encrypt_1blk, no trailing rev32 is emitted
+ my $dat = shift; # name of the vector register holding the block; it is updated in place
+
+$code.=<<___;
+ mov $ptr,$rks
+ mov $counter,#8
+ mov $word0,$dat.s[0]
+ mov $word1,$dat.s[1]
+ mov $word2,$dat.s[2]
+ mov $word3,$dat.s[3]
+10:
+___
+ &sm4_1blk($ptr); # loop body; sm4_1blk (defined just above) updates $word0..$word3 and advances $ptr
+$code.=<<___;
+ subs $counter,$counter,#1
+ b.ne 10b
+ mov $dat.s[0],$word3
+ mov $dat.s[1],$word2
+ mov $dat.s[2],$word1
+ mov $dat.s[3],$word0
+___
+}
+
+sub encrypt_1blk() {
+ my ($blk) = @_; # vector register name; processed in place
+
+ &encrypt_1blk_norev($blk); # the round loop, result left in $blk
+ &rev32($blk,$blk); # then rev32() on the same register
+}
+
+sub encrypt_4blks() {
+$code.=<<___;
+ mov $ptr,$rks
+ mov $counter,#8
+10:
+___
+ &sm4_4blks($ptr);
+$code.=<<___;
+ subs $counter,$counter,#1
+ b.ne 10b
+___
+ # Pass the four results through rev32() (defined earlier in the file),
+ # moving them from @data into @vtmp in reversed register order:
+ # @data[0] -> @vtmp[3], @data[1] -> @vtmp[2], ..., @data[3] -> @vtmp[0].
+ for my $i (0..3) { &rev32($vtmp[3-$i],$data[$i]); }
+}
+
+sub encrypt_8blks() {
+$code.=<<___;
+ mov $ptr,$rks
+ mov $counter,#8
+10:
+___
+ &sm4_8blks($ptr);
+$code.=<<___;
+ subs $counter,$counter,#1
+ b.ne 10b
+___
+ # Results leave through rev32() (defined earlier in the file) in
+ # reversed register order, in two passes whose order matters:
+ # 1. @data[0..3] -> @vtmp[3..0] (first four lanes)
+ # 2. @datax[0..3] -> @data[3..0] (second four lanes)
+ # Pass 2 overwrites @data, so it must run after pass 1 has read it.
+ # Callers therefore find the output in @vtmp followed by @data.
+ for my $i (0..3) { &rev32($vtmp[3-$i],$data[$i]); }
+ for my $i (0..3) { &rev32($data[3-$i],$datax[$i]); }
+}
+
+sub load_sbox () {
+ # Takes no arguments (every call visible in this file is &load_sbox()); emits six 128-bit `ldr <reg>, =imm` constant loads.
+
+$code.=<<___;
+ ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00
+ ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00
+ ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923
+ ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300
+ ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b
+ ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+___
+}
+
+sub mov_reg_to_vec() {
+ # Emit code copying two general registers into the low ($lo) and high
+ # ($hi) 64-bit halves of vector $vec, followed by rev32_armeb() on it.
+ my ($lo,$hi,$vec) = @_;
+$code.=<<___;
+ mov $vec.d[0],$lo
+ mov $vec.d[1],$hi
+___
+ &rev32_armeb($vec,$vec);
+}
+
+sub mov_vec_to_reg() {
+ # Emit code copying the low and high 64-bit halves of vector $vec
+ # into the general registers $lo and $hi respectively.
+ my ($vec,$lo,$hi) = @_;
+$code.=<<___;
+ mov $lo,$vec.d[0]
+ mov $hi,$vec.d[1]
+___
+}
+
+sub compute_tweak() {
+ # Emit code deriving the next tweak ($des1:$des0) from the current one
+ # ($src1:$src0): a 128-bit left shift by one, with 0x87 folded into the low
+ # half under a mask built from the top bit of $src1 (assumes the w/x temp names alias - TODO confirm).
+ my ($src0,$src1,$des0,$des1) = @_;
+$code.=<<___;
+ mov $wtmp0,0x87
+ extr $xtmp2,$src1,$src1,#32
+ extr $des1,$src1,$src0,#63
+ and $wtmp1,$wtmp0,$wtmp2,asr#31
+ eor $des0,$xtmp1,$src0,lsl#1
+___
+}
+
+sub compute_tweak_vec() {
+ # Vector-register form of the tweak update: $src -> $des, with rbit()
+ # applied before and after according to $std; clobbers @vtmp[0..2].
+ my ($src,$des,$std) = @_;
+ &rbit($vtmp[2],$src,$std);
+$code.=<<___;
+ ldr @qtmp[0], =0x01010101010101010101010101010187
+ shl $des.16b, @vtmp[2].16b, #1
+ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
+ ushr @vtmp[1].16b, @vtmp[1].16b, #7
+ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
+ eor $des.16b, $des.16b, @vtmp[1].16b
+___
+ &rbit($des,$des,$std);
+}
+
+$code=<<___; # plain assignment (not .=): the output starts here with the include, the .arch line and the constant tables .Lck/.Lfk/.Lshuffles
+#include "arm_arch.h"
+.arch armv8-a+crypto
+.text
+
+.type _${prefix}_consts,%object
+.align 7
+_${prefix}_consts:
+.Lck:
+ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
+.Lshuffles:
+ .dword 0x0B0A090807060504,0x030201000F0E0D0C
+
+.size _${prefix}_consts,.-_${prefix}_consts
+___
+
+{{{ # _${prefix}_set_key: internal key-schedule routine shared by the set_encrypt_key / set_decrypt_key wrappers below
+my ($key,$keys,$enc)=("x0","x1","w2"); # user key pointer, round-key output pointer, direction flag (non-zero stores forwards, zero stores backwards from offset 124)
+my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); # table pointer, loop counter (32), scratch word, current round key
+my ($vkey,$vfk,$vmap)=("v5","v6","v7"); # key state, .Lfk constants, .Lshuffles permutation for tbl
+$code.=<<___;
+.type _${prefix}_set_key,%function
+.align 4
+_${prefix}_set_key:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {$vkey.4s},[$key]
+___
+ &load_sbox(); # load the constant registers used by tbl / mul_matrix below
+ &rev32($vkey,$vkey);
+$code.=<<___;
+ adr $pointer,.Lshuffles
+ ld1 {$vmap.2d},[$pointer]
+ adr $pointer,.Lfk
+ ld1 {$vfk.2d},[$pointer]
+ eor $vkey.16b,$vkey.16b,$vfk.16b
+ mov $schedules,#32
+ adr $pointer,.Lck
+ movi @vtmp[0].16b,#64
+ cbnz $enc,1f
+ add $keys,$keys,124
+1:
+ mov $wtmp,$vkey.s[1]
+ ldr $roundkey,[$pointer],#4
+ eor $roundkey,$roundkey,$wtmp
+ mov $wtmp,$vkey.s[2]
+ eor $roundkey,$roundkey,$wtmp
+ mov $wtmp,$vkey.s[3]
+ eor $roundkey,$roundkey,$wtmp
+ // optimize sbox using AESE instruction
+ mov @data[0].s[0],$roundkey
+ tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
+___
+ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); # mul_matrix() is defined earlier in the file; applied before the aese step
+$code.=<<___;
+ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
+ aese @vtmp[0].16b,@vtmp[1].16b
+___
+ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); # second matrix step, applied after aese
+$code.=<<___;
+ mov $wtmp,@vtmp[0].s[0]
+ eor $roundkey,$wtmp,$wtmp,ror #19
+ eor $roundkey,$roundkey,$wtmp,ror #9
+ mov $wtmp,$vkey.s[0]
+ eor $roundkey,$roundkey,$wtmp
+ mov $vkey.s[0],$roundkey
+ cbz $enc,2f
+ str $roundkey,[$keys],#4
+ b 3f
+2:
+ str $roundkey,[$keys],#-4
+3:
+ tbl $vkey.16b,{$vkey.16b},$vmap.16b
+ subs $schedules,$schedules,#1
+ b.ne 1b
+ ret
+.size _${prefix}_set_key,.-_${prefix}_set_key
+___
+}}}
+
+
+{{{ # _${prefix}_enc_4blks: out-of-line copy of encrypt_4blks(), reached with `bl` from the mode functions below
+$code.=<<___;
+.type _${prefix}_enc_4blks,%function
+.align 4
+_${prefix}_enc_4blks:
+ AARCH64_VALID_CALL_TARGET
+___
+ &encrypt_4blks(); # input in @data, output left in @vtmp
+$code.=<<___;
+ ret
+.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
+___
+}}}
+
+{{{ # _${prefix}_enc_8blks: out-of-line copy of encrypt_8blks(), reached with `bl` from the mode functions below
+$code.=<<___;
+.type _${prefix}_enc_8blks,%function
+.align 4
+_${prefix}_enc_8blks:
+ AARCH64_VALID_CALL_TARGET
+___
+ &encrypt_8blks(); # input in @data/@datax, output left in @vtmp (first four) and @data (last four)
+$code.=<<___;
+ ret
+.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
+___
+}}}
+
+
+{{{ # ${prefix}_set_encrypt_key: sets w2 to 1 and calls _${prefix}_set_key
+my ($key,$keys)=("x0","x1"); # not referenced in this block; x0/x1 reach _set_key unchanged
+$code.=<<___;
+.globl ${prefix}_set_encrypt_key
+.type ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,1
+ bl _${prefix}_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+___
+}}}
+
+{{{ # ${prefix}_set_decrypt_key: sets w2 to 0 and calls _${prefix}_set_key, which then stores the round keys in reverse order
+my ($key,$keys)=("x0","x1"); # not referenced in this block; x0/x1 reach _set_key unchanged
+$code.=<<___;
+.globl ${prefix}_set_decrypt_key
+.type ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ mov w2,0
+ bl _${prefix}_set_key
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+
+{{{ # single-block entry points ${prefix}_encrypt / ${prefix}_decrypt
+sub gen_block () { # emits ${prefix}_${dir}crypt; the emitted body does not depend on $dir beyond the symbol name
+ my $dir = shift; # "en" or "de" (see the two calls below)
+ my ($inp,$outp,$rk)=map("x$_",(0..2)); # x0 = input block, x1 = output block, x2 = round keys
+
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {@data[0].4s},[$inp]
+___
+ &load_sbox();
+ &rev32(@data[0],@data[0]);
+$code.=<<___;
+ mov $rks,$rk
+___
+ &encrypt_1blk(@data[0]); # inlined; reads the round keys through $rks
+$code.=<<___;
+ st1 {@data[0].4s},[$outp]
+ ret
+.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+
+{{{ # ${prefix}_ecb_encrypt: 8 blocks per iteration while at least 8 remain, then one 4-block step, then a 1/2/3-block tail
+$code.=<<___;
+.globl ${prefix}_ecb_encrypt
+.type ${prefix}_ecb_encrypt,%function
+.align 5
+${prefix}_ecb_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ // convert length into blocks
+ lsr x2,x2,4
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+___
+ &load_sbox();
+$code.=<<___;
+.Lecb_8_blocks_process:
+ cmp $blocks,#8
+ b.lt .Lecb_4_blocks_process
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+___
+ &rev32(@data[0],@data[0]); # rev32() each loaded register in place before the rounds
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],@datax[3]);
+$code.=<<___;
+ bl _${prefix}_enc_8blks
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.gt .Lecb_8_blocks_process
+ b 100f
+.Lecb_4_blocks_process:
+ cmp $blocks,#4
+ b.lt 1f
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ sub $blocks,$blocks,#4
+1:
+ // process last block
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@data[0].4s},[$inp]
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]); # a single block uses the inlined scalar path
+$code.=<<___;
+ st1 {@data[0].4s},[$outp]
+ b 100f
+1: // process last 2 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
+ cmp $blocks,#2
+ b.gt 1f
+___
+ &rev32(@data[0],@data[0]); # two blocks occupy lanes 0 and 1; the 4-block routine is still used
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
+ b 100f
+1: // process last 3 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
+___
+ &rev32(@data[0],@data[0]); # three blocks occupy lanes 0..2
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
+ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
+___
+}}}
+
+{{{ # ${prefix}_cbc_encrypt: serial block-by-block encryption when $enc is non-zero, 8/4-lane parallel decryption (.Ldec) otherwise
+my ($len,$ivp,$enc)=("x2","x4","w5"); # byte length (shifted to a block count), IV pointer, direction flag
+my $ivec0=("v3"); # chaining value on the encrypt path
+my $ivec1=("v15"); # chaining value on the decrypt path
+
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ lsr $len,$len,4
+___
+ &load_sbox();
+$code.=<<___;
+ cbz $enc,.Ldec
+ ld1 {$ivec0.4s},[$ivp]
+.Lcbc_4_blocks_enc:
+ cmp $blocks,#4
+ b.lt 1f
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+ eor @data[0].16b,@data[0].16b,$ivec0.16b
+___
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &encrypt_1blk_norev(@data[0]); # each block is chained into the next before its own final rev32
+$code.=<<___;
+ eor @data[1].16b,@data[1].16b,@data[0].16b
+___
+ &encrypt_1blk_norev(@data[1]);
+ &rev32(@data[0],@data[0]); # block 0 is only finalised after it has been XORed into block 1
+
+$code.=<<___;
+ eor @data[2].16b,@data[2].16b,@data[1].16b
+___
+ &encrypt_1blk_norev(@data[2]);
+ &rev32(@data[1],@data[1]);
+$code.=<<___;
+ eor @data[3].16b,@data[3].16b,@data[2].16b
+___
+ &encrypt_1blk_norev(@data[3]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ orr $ivec0.16b,@data[3].16b,@data[3].16b
+ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#4
+ b.ne .Lcbc_4_blocks_enc
+ b 2f
+1:
+ subs $blocks,$blocks,#1
+ b.lt 2f
+ ld1 {@data[0].4s},[$inp],#16
+ eor $ivec0.16b,$ivec0.16b,@data[0].16b
+___
+ &rev32($ivec0,$ivec0);
+ &encrypt_1blk($ivec0); # the ciphertext stays in $ivec0 and becomes the next chaining value
+$code.=<<___;
+ st1 {$ivec0.4s},[$outp],#16
+ b 1b
+2:
+ // save back IV
+ st1 {$ivec0.4s},[$ivp]
+ ret
+
+.Ldec:
+ // decryption mode starts
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+.Lcbc_8_blocks_dec:
+ cmp $blocks,#8
+ b.lt 1f
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
+ add $ptr,$inp,#64
+ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
+___
+ &rev32(@data[0],@data[0]); # $inp is not advanced above: the same ciphertext is re-read with ld1 after the rounds
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],$data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],$datax[3]);
+$code.=<<___;
+ bl _${prefix}_enc_8blks
+___
+ &transpose(@vtmp,@datax); # transpose() is defined earlier in the file; applied before the per-block st1 stores
+ &transpose(@data,@datax);
+$code.=<<___;
+ ld1 {$ivec1.4s},[$ivp]
+ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+ // note ivec1 and vtmpx[3] are resuing the same register
+ // care needs to be taken to avoid conflict
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
+ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
+ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
+ // save back IV
+ st1 {$vtmpx[3].4s}, [$ivp]
+ eor @data[0].16b,@data[0].16b,$datax[3].16b
+ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
+ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
+ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.gt .Lcbc_8_blocks_dec
+ b.eq 100f
+1:
+ ld1 {$ivec1.4s},[$ivp]
+.Lcbc_4_blocks_dec:
+ cmp $blocks,#4
+ b.lt 1f
+ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],$data[3]);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &transpose(@vtmp,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ orr $ivec1.16b,@data[3].16b,@data[3].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
+ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ subs $blocks,$blocks,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+ st1 {@data[3].4s}, [$ivp]
+ b 100f
+1: // last block
+ subs $blocks,$blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@data[0].4s},[$inp],#16
+ // save back IV
+ st1 {$data[0].4s}, [$ivp]
+___
+ &rev32(@datax[0],@data[0]);
+ &encrypt_1blk(@datax[0]);
+$code.=<<___;
+ eor @datax[0].16b,@datax[0].16b,$ivec1.16b
+ st1 {@datax[0].4s},[$outp],#16
+ b 100f
+1: // last two blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
+ add $ptr,$inp,#16
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
+ subs $blocks,$blocks,1
+ b.gt 1f
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
+___
+ &transpose(@vtmp,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
+ // save back IV
+ st1 {@data[1].4s}, [$ivp]
+ b 100f
+1: // last 3 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
+___
+ &transpose(@vtmp,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
+ // save back IV
+ st1 {@data[2].4s}, [$ivp]
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+
+{{{ # ${prefix}_ctr32_encrypt_blocks: encrypts counter blocks and XORs them with the input; only the last 32-bit counter word is incremented
+my ($ivp)=("x4"); # pointer to the 16-byte counter block
+my ($ctr)=("w5"); # running value of the last counter word, +1 per block
+my $ivec=("v3"); # counter block as loaded from $ivp
+
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {$ivec.4s},[$ivp]
+___
+ &rev32($ivec,$ivec);
+ &load_sbox();
+$code.=<<___;
+ cmp $blocks,#1
+ b.ne 1f
+ // fast processing for one single block without
+ // context saving overhead
+___
+ &encrypt_1blk($ivec); # inlined scalar path, so no stack frame is needed for this case
+$code.=<<___;
+ ld1 {@data[0].4s},[$inp]
+ eor @data[0].16b,@data[0].16b,$ivec.16b
+ st1 {@data[0].4s},[$outp]
+ ret
+1:
+ AARCH64_SIGN_LINK_REGISTER
+ stp d8,d9,[sp,#-80]!
+ stp d10,d11,[sp,#16]
+ stp d12,d13,[sp,#32]
+ stp d14,d15,[sp,#48]
+ stp x29,x30,[sp,#64]
+ mov $word0,$ivec.s[0]
+ mov $word1,$ivec.s[1]
+ mov $word2,$ivec.s[2]
+ mov $ctr,$ivec.s[3]
+.Lctr32_4_blocks_process:
+ cmp $blocks,#4
+ b.lt 1f
+ dup @data[0].4s,$word0
+ dup @data[1].4s,$word1
+ dup @data[2].4s,$word2
+ mov @data[3].s[0],$ctr
+ add $ctr,$ctr,#1
+ mov $data[3].s[1],$ctr
+ add $ctr,$ctr,#1
+ mov @data[3].s[2],$ctr
+ add $ctr,$ctr,#1
+ mov @data[3].s[3],$ctr
+ add $ctr,$ctr,#1
+ cmp $blocks,#8
+ b.ge .Lctr32_8_blocks_process
+ bl _${prefix}_enc_4blks
+ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ subs $blocks,$blocks,#4
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+.Lctr32_8_blocks_process:
+ dup @datax[0].4s,$word0
+ dup @datax[1].4s,$word1
+ dup @datax[2].4s,$word2
+ mov @datax[3].s[0],$ctr
+ add $ctr,$ctr,#1
+ mov $datax[3].s[1],$ctr
+ add $ctr,$ctr,#1
+ mov @datax[3].s[2],$ctr
+ add $ctr,$ctr,#1
+ mov @datax[3].s[3],$ctr
+ add $ctr,$ctr,#1
+ bl _${prefix}_enc_8blks
+ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
+ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ eor @data[0].16b,@data[0].16b,@datax[0].16b
+ eor @data[1].16b,@data[1].16b,@datax[1].16b
+ eor @data[2].16b,@data[2].16b,@datax[2].16b
+ eor @data[3].16b,@data[3].16b,@datax[3].16b
+ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.ne .Lctr32_4_blocks_process
+ b 100f
+1: // last block processing
+ subs $blocks,$blocks,#1
+ b.lt 100f
+ b.gt 1f
+ mov $ivec.s[0],$word0
+ mov $ivec.s[1],$word1
+ mov $ivec.s[2],$word2
+ mov $ivec.s[3],$ctr
+___
+ &encrypt_1blk($ivec); # a single trailing block again uses the inlined scalar path
+$code.=<<___;
+ ld1 {@data[0].4s},[$inp]
+ eor @data[0].16b,@data[0].16b,$ivec.16b
+ st1 {@data[0].4s},[$outp]
+ b 100f
+1: // last 2 blocks processing
+ dup @data[0].4s,$word0
+ dup @data[1].4s,$word1
+ dup @data[2].4s,$word2
+ mov @data[3].s[0],$ctr
+ add $ctr,$ctr,#1
+ mov @data[3].s[1],$ctr
+ subs $blocks,$blocks,#1
+ b.ne 1f
+ bl _${prefix}_enc_4blks
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
+ b 100f
+1: // last 3 blocks processing
+ add $ctr,$ctr,#1
+ mov @data[3].s[2],$ctr
+ bl _${prefix}_enc_4blks
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
+ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
+ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
+ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
+ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
+ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
+ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
+100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+ ldp d14,d15,[sp,#48]
+ ldp x29,x30,[sp,#64]
+ ldp d8,d9,[sp],#80
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+
+
+{{{ # XTS entry points: ${prefix}_xts_encrypt_gb and ${prefix}_xts_encrypt, both generated by gen_xts_cipher()
+my ($blocks,$len)=("x2","x2"); # both names are x2: the byte length is converted to a block count in place
+my $ivp=("x5"); # pointer to the 16-byte initial tweak input
+my @twx=map("x$_",(12..27)); # eight tweaks as lo/hi register pairs; @twx[14..15] are x26/x27
+my ($rks1,$rks2)=("x26","x27"); # copies of x3 / x4; only needed until \$rks is loaded, after which x26/x27 are reused by @twx
+my $lastBlk=("x26"); # shares x26 again, in the partial-block tail
+my $enc=("w28"); # copy of w6; 1 selects encryption at .check_dec
+my $remain=("x29"); # length & 0x0F, i.e. bytes in the trailing partial block
+
+my @tweak=map("v$_",(16..23));
+my $lastTweak=("v25"); # tweak of the last full block handled so far
+
+sub gen_xts_cipher() { # emit ${prefix}_xts_encrypt${std}: whole blocks in 8/4/1-3 groups, then the partial-block tail
+ my $std = shift; # symbol/label suffix ("_gb" or "", see the two calls below); also forwarded to rbit() and compute_tweak_vec()
+$code.=<<___;
+.globl ${prefix}_xts_encrypt${std}
+.type ${prefix}_xts_encrypt${std},%function
+.align 5
+${prefix}_xts_encrypt${std}:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+ mov $rks1,x3
+ mov $rks2,x4
+ mov $enc,w6
+ ld1 {@tweak[0].4s}, [$ivp]
+ mov $rks,$rks2
+___
+ &load_sbox();
+ &rev32(@tweak[0],@tweak[0]);
+ &encrypt_1blk(@tweak[0]); # the first tweak is the input at $ivp run through the cipher with the x4 key schedule
+$code.=<<___;
+ mov $rks,$rks1
+ and $remain,$len,#0x0F
+ // convert length into blocks
+ lsr $blocks,$len,4
+ cmp $blocks,#1
+ b.lt .return${std}
+
+ cmp $remain,0
+ // If the encryption/decryption Length is N times of 16,
+ // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
+ b.eq .xts_encrypt_blocks${std}
+
+ // If the encryption/decryption length is not N times of 16,
+ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
+ // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
+ subs $blocks,$blocks,#1
+ b.eq .only_2blks_tweak${std}
+.xts_encrypt_blocks${std}:
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rev32_armeb(@tweak[0],@tweak[0]);
+ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); # pre-compute tweaks 1..7 in general registers
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
+$code.=<<___;
+.Lxts_8_blocks_process${std}:
+ cmp $blocks,#8
+___
+ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); # copy each tweak to its vector register, then refill the pair with the tweak eight positions later
+ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
+ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
+ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
+ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
+ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
+ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
+ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
+ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
+$code.=<<___;
+ b.lt .Lxts_4_blocks_process${std}
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+ &rbit(@tweak[2],@tweak[2],$std);
+ &rbit(@tweak[3],@tweak[3],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+ eor @data[3].16b, @data[3].16b, @tweak[3].16b
+ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+___
+ &rbit(@tweak[4],@tweak[4],$std);
+ &rbit(@tweak[5],@tweak[5],$std);
+ &rbit(@tweak[6],@tweak[6],$std);
+ &rbit(@tweak[7],@tweak[7],$std);
+$code.=<<___;
+ eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
+ eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
+ eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
+ eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],@datax[3]);
+ &transpose(@data,@vtmp); # blocks were loaded with ld1 (one block per register), so transpose() is applied before and after the lane-parallel rounds
+ &transpose(@datax,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_8blks
+___
+ &transpose(@vtmp,@datax);
+ &transpose(@data,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
+ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
+ eor @data[0].16b, @data[0].16b, @tweak[4].16b
+ eor @data[1].16b, @data[1].16b, @tweak[5].16b
+ eor @data[2].16b, @data[2].16b, @tweak[6].16b
+ eor @data[3].16b, @data[3].16b, @tweak[7].16b
+
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[7].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
+ subs $blocks,$blocks,#8
+ b.gt .Lxts_8_blocks_process${std}
+ b 100f
+.Lxts_4_blocks_process${std}:
+ cmp $blocks,#4
+ b.lt 1f
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+ &rbit(@tweak[2],@tweak[2],$std);
+ &rbit(@tweak[3],@tweak[3],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+ eor @data[3].16b, @data[3].16b, @tweak[3].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &transpose(@data,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+___
+ &transpose(@vtmp,@data);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
+ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
+ sub $blocks,$blocks,#4
+ mov @tweak[0].16b,@tweak[4].16b
+ mov @tweak[1].16b,@tweak[5].16b
+ mov @tweak[2].16b,@tweak[6].16b
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[3].16b
+1:
+ // process last block
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+ ld1 {@data[0].4s},[$inp],#16
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ st1 {@data[0].4s},[$outp],#16
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[0].16b
+ b 100f
+1: // process last 2 blocks
+ cmp $blocks,#2
+ b.gt 1f
+ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &transpose(@data,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+___
+ &transpose(@vtmp,@data);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[1].16b
+ b 100f
+1: // process last 3 blocks
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
+___
+ &rbit(@tweak[0],@tweak[0],$std);
+ &rbit(@tweak[1],@tweak[1],$std);
+ &rbit(@tweak[2],@tweak[2],$std);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &transpose(@data,@vtmp);
+$code.=<<___;
+ bl _${prefix}_enc_4blks
+___
+ &transpose(@vtmp,@data);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[2].16b
+100:
+ cmp $remain,0
+ b.eq .return${std}
+
+// This brance calculates the last two tweaks,
+// while the encryption/decryption length is larger than 32
+.last_2blks_tweak${std}:
+___
+ &rev32_armeb($lastTweak,$lastTweak);
+ &compute_tweak_vec($lastTweak,@tweak[1],$std);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$std);
+$code.=<<___;
+ b .check_dec${std}
+
+
+// This brance calculates the last two tweaks,
+// while the encryption/decryption length is equal to 32, who only need two tweaks
+.only_2blks_tweak${std}:
+ mov @tweak[1].16b,@tweak[0].16b
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$std); # fix: pass $std as the .last_2blks_tweak calls do, so the "_gb" variant gets the same rbit() handling on this path
+$code.=<<___;
+ b .check_dec${std}
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec${std}:
+ // encryption:1 decryption:0
+ cmp $enc,1
+ b.eq .prcess_last_2blks${std}
+ mov @vtmp[0].16B,@tweak[1].16b
+ mov @tweak[1].16B,@tweak[2].16b
+ mov @tweak[2].16B,@vtmp[0].16b
+
+.prcess_last_2blks${std}:
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &rev32_armeb(@tweak[2],@tweak[2]);
+$code.=<<___;
+ ld1 {@data[0].4s},[$inp],#16
+ eor @data[0].16b, @data[0].16b, @tweak[1].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]); # last full block, processed with @tweak[1]
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[1].16b
+ st1 {@data[0].4s},[$outp],#16
+
+ sub $lastBlk,$outp,16
+ .loop${std}:
+ subs $remain,$remain,1
+ ldrb $wtmp0,[$lastBlk,$remain]
+ ldrb $wtmp1,[$inp,$remain]
+ strb $wtmp1,[$lastBlk,$remain]
+ strb $wtmp0,[$outp,$remain]
+ b.gt .loop${std}
+ ld1 {@data[0].4s}, [$lastBlk]
+ eor @data[0].16b, @data[0].16b, @tweak[2].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]); # the block rebuilt by the byte-swap loop above, processed with @tweak[2]
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[2].16b
+ st1 {@data[0].4s}, [$lastBlk]
+.return${std}:
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
+___
+} # end of gen_xts_cipher
+&gen_xts_cipher("_gb");
+&gen_xts_cipher("");
+}}}
+
+########################################
+open SELF,$0; # re-read this script so its leading comment header can be copied into the output
+while(<SELF>) {
+ next if (/^#!/); # drop the shebang line
+ last if (!s/^#/\/\// and !/^$/); # rewrite a leading '#' as '//'; stop at the first line that is neither a comment nor empty
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) { # emit the accumulated assembly line by line
+ s/\`([^\`]*)\`/eval($1)/ge; # replace every backtick-quoted span with the result of evaluating it as Perl
+ print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
index 75a215ab80..73ffe5ea09 100644
--- a/crypto/sm4/build.info
+++ b/crypto/sm4/build.info
@@ -2,7 +2,7 @@ LIBS=../../libcrypto
IF[{- !$disabled{asm} -}]
$SM4DEF_aarch64=SM4_ASM VPSM4_ASM
- $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S
+ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S vpsm4_ex-armv8.S
# Now that we have defined all the arch specific variables, use the
# appropriate one, and define the appropriate macros
@@ -30,5 +30,7 @@ ENDIF
GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl
+GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl
INCLUDE[sm4-armv8.o]=..
INCLUDE[vpsm4-armv8.o]=..
+INCLUDE[vpsm4_ex-armv8.o]=..
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
index 15d8abbcb1..8b9cd10f97 100644
--- a/include/crypto/sm4_platform.h
+++ b/include/crypto/sm4_platform.h
@@ -20,11 +20,16 @@ static inline int vpsm4_capable(void)
{
return (OPENSSL_armcap_P & ARMV8_CPUID) &&
(MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) ||
- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1) ||
- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920));
+ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1));
+}
+static inline int vpsm4_ex_capable(void) /* run-time check behind VPSM4_EX_CAPABLE; callers use it to choose the vpsm4_ex_* routines */
+{
+ return (OPENSSL_armcap_P & ARMV8_CPUID) && /* requires the ARMV8_CPUID bit in OPENSSL_armcap_P (presumably: MIDR is readable -- confirm) */
+ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920)); /* and OPENSSL_arm_midr must match HISI_CPU_IMP / HISI_CPU_PART_KP920 */
}
# if defined(VPSM4_ASM)
# define VPSM4_CAPABLE vpsm4_capable()
+# define VPSM4_EX_CAPABLE vpsm4_ex_capable()
# endif
# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
@@ -56,7 +61,7 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
const unsigned char ivec[16]);
# endif /* HWSM4_CAPABLE */
-#ifdef VPSM4_CAPABLE
+# ifdef VPSM4_CAPABLE
int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
void vpsm4_encrypt(const unsigned char *in, unsigned char *out,
@@ -72,7 +77,37 @@ void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
const unsigned char ivec[16]);
+void vpsm4_xts_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
+ const unsigned char ivec[16], const int enc);
+void vpsm4_xts_encrypt_gb(const unsigned char *in, unsigned char *out,
+ size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
+ const unsigned char ivec[16], const int enc);
# endif /* VPSM4_CAPABLE */
+# ifdef VPSM4_EX_CAPABLE /* prototypes of the vpsm4_ex_* SM4 routines; presumably implemented in the generated vpsm4_ex-armv8.S -- confirm */
+int vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
+int vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
+void vpsm4_ex_encrypt(const unsigned char *in, unsigned char *out,
+ const SM4_KEY *key); /* installed as ctx->block (block128_f) after vpsm4_ex_set_encrypt_key in cipher_hw_sm4_initkey */
+void vpsm4_ex_decrypt(const unsigned char *in, unsigned char *out,
+ const SM4_KEY *key); /* installed as ctx->block (block128_f) after vpsm4_ex_set_decrypt_key in cipher_hw_sm4_initkey */
+void vpsm4_ex_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const SM4_KEY *key,
+ unsigned char *ivec, const int enc); /* installed as ctx->stream.cbc (cbc128_f) in CBC mode, on both key paths */
+void vpsm4_ex_ecb_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const SM4_KEY *key,
+ const int enc); /* installed as ctx->stream.ecb (ecb128_f) in ECB mode, on both key paths */
+void vpsm4_ex_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ const unsigned char ivec[16]); /* installed as ctx->stream.ctr (ctr128_f) in CTR mode */
+void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out,
+ size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
+ const unsigned char ivec[16], const int enc); /* used as the XTS `stream` callback; sm4_xts_cipher passes ctx->base.enc as enc */
+void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out,
+ size_t len, const SM4_KEY *key1,
+ const SM4_KEY *key2, const unsigned char ivec[16],
+ const int enc); /* used as the XTS `stream_gb` callback (taken when ctx->xts_standard is false); enc as above */
+# endif /* VPSM4_EX_CAPABLE */
#endif /* OSSL_SM4_PLATFORM_H */
diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
index 9a2e99f67c..8cabd78266 100644
--- a/providers/implementations/ciphers/cipher_sm4_hw.c
+++ b/providers/implementations/ciphers/cipher_sm4_hw.c
@@ -42,6 +42,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
(void)0; /* terminate potentially open 'else' */
} else
#endif
+#ifdef VPSM4_EX_CAPABLE
+ if (VPSM4_EX_CAPABLE) {
+ vpsm4_ex_set_encrypt_key(key, ks);
+ ctx->block = (block128_f)vpsm4_ex_encrypt;
+ ctx->stream.cbc = NULL;
+ if (ctx->mode == EVP_CIPH_CBC_MODE)
+ ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt;
+ else if (ctx->mode == EVP_CIPH_ECB_MODE)
+ ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt;
+ else if (ctx->mode == EVP_CIPH_CTR_MODE)
+ ctx->stream.ctr = (ctr128_f)vpsm4_ex_ctr32_encrypt_blocks;
+ } else
+#endif
#ifdef VPSM4_CAPABLE
if (VPSM4_CAPABLE) {
vpsm4_set_encrypt_key(key, ks);
@@ -75,6 +88,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
#endif
} else
#endif
+#ifdef VPSM4_EX_CAPABLE
+ if (VPSM4_EX_CAPABLE) {
+ vpsm4_ex_set_decrypt_key(key, ks);
+ ctx->block = (block128_f)vpsm4_ex_decrypt;
+ ctx->stream.cbc = NULL;
+ if (ctx->mode == EVP_CIPH_CBC_MODE)
+ ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt;
+ else if (ctx->mode == EVP_CIPH_ECB_MODE)
+ ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt;
+ } else
+#endif
#ifdef VPSM4_CAPABLE
if (VPSM4_CAPABLE) {
vpsm4_set_decrypt_key(key, ks);
@@ -82,7 +106,7 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
ctx->stream.cbc = NULL;
if (ctx->mode == EVP_CIPH_CBC_MODE)
ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
- else if (ctx->mode == EVP_CIPH_ECB_MODE)
+ else if (ctx->mode == EVP_CIPH_ECB_MODE)
ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
} else
#endif
diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c
index 3c568d4d18..037055fce8 100644
--- a/providers/implementations/ciphers/cipher_sm4_xts.c
+++ b/providers/implementations/ciphers/cipher_sm4_xts.c
@@ -145,14 +145,14 @@ static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl,
if (ctx->xts_standard) {
if (ctx->stream != NULL)
(*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
- ctx->base.iv);
+ ctx->base.iv, ctx->base.enc);
else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl,
ctx->base.enc))
return 0;
} else {
if (ctx->stream_gb != NULL)
(*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
- ctx->base.iv);
+ ctx->base.iv, ctx->base.enc);
else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out,
inl, ctx->base.enc))
return 0;
diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h
index 4c369183e2..cfca596979 100644
--- a/providers/implementations/ciphers/cipher_sm4_xts.h
+++ b/providers/implementations/ciphers/cipher_sm4_xts.h
@@ -14,7 +14,7 @@
PROV_CIPHER_FUNC(void, xts_stream,
(const unsigned char *in, unsigned char *out, size_t len,
const SM4_KEY *key1, const SM4_KEY *key2,
- const unsigned char iv[16]));
+ const unsigned char iv[16], const int enc)); /* enc: sm4_xts_cipher passes ctx->base.enc, so one callback serves both directions */
typedef struct prov_sm4_xts_ctx_st {
/* Must be first */
diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
index 403eb879b1..67a9923d94 100644
--- a/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
@@ -11,8 +11,7 @@
#define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \
fn_block_enc, fn_block_dec, \
- fn_stream_enc, fn_stream_dec, \
- fn_stream_gb_enc, fn_stream_gb_dec) { \
+ fn_stream, fn_stream_gb) { \
size_t bytes = keylen / 2; \
\
if (ctx->enc) { \
@@ -26,8 +25,8 @@
xctx->xts.block2 = (block128_f)fn_block_enc; \
xctx->xts.key1 = &xctx->ks1; \
xctx->xts.key2 = &xctx->ks2; \
- xctx->stream = ctx->enc ? fn_stream_enc : fn_stream_dec; \
- xctx->stream_gb = ctx->enc ? fn_stream_gb_enc : fn_stream_gb_dec; \
+ xctx->stream = fn_stream; \
+ xctx->stream_gb = fn_stream_gb; \
}
static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
@@ -35,23 +34,30 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
size_t keylen)
{
PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx;
- OSSL_xts_stream_fn stream_enc = NULL;
- OSSL_xts_stream_fn stream_dec = NULL;
- OSSL_xts_stream_fn stream_gb_enc = NULL;
- OSSL_xts_stream_fn stream_gb_dec = NULL;
+ OSSL_xts_stream_fn stream = NULL;
+ OSSL_xts_stream_fn stream_gb = NULL;
#ifdef HWSM4_CAPABLE
if (HWSM4_CAPABLE) {
XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key,
- HWSM4_encrypt, HWSM4_decrypt, stream_enc, stream_dec,
- stream_gb_enc, stream_gb_dec);
+ HWSM4_encrypt, HWSM4_decrypt, stream, stream_gb);
return 1;
} else
#endif /* HWSM4_CAPABLE */
+#ifdef VPSM4_EX_CAPABLE
+ if (VPSM4_EX_CAPABLE) {
+ stream = vpsm4_ex_xts_encrypt;
+ stream_gb = vpsm4_ex_xts_encrypt_gb;
+ XTS_SET_KEY_FN(vpsm4_ex_set_encrypt_key, vpsm4_ex_set_decrypt_key,
+ vpsm4_ex_encrypt, vpsm4_ex_decrypt, stream, stream_gb);
+ return 1;
+ } else
+#endif /* VPSM4_EX_CAPABLE */
#ifdef VPSM4_CAPABLE
if (VPSM4_CAPABLE) {
+ stream = vpsm4_xts_encrypt;
+ stream_gb = vpsm4_xts_encrypt_gb;
XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key,
- vpsm4_encrypt, vpsm4_decrypt, stream_enc, stream_dec,
- stream_gb_enc, stream_gb_dec);
+ vpsm4_encrypt, vpsm4_decrypt, stream, stream_gb);
return 1;
} else
#endif /* VPSM4_CAPABLE */
@@ -60,8 +66,7 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
}
{
XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt,
- ossl_sm4_decrypt, stream_enc, stream_dec, stream_gb_enc,
- stream_gb_dec);
+ ossl_sm4_decrypt, stream, stream_gb);
}
return 1;
}
--
2.37.3.windows.1
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/jinlun123123/openssl.git
git@gitee.com:jinlun123123/openssl.git
jinlun123123
openssl
openssl
master

搜索帮助

D67c1975 1850385 1daf7b77 1850385