src-openEuler/libwd

0026-cipher-add-support-for-SM4-CFB-and-XTS-modes-in-CE-i.patch 30.33 KB
JangShui Yang committed on 2024-04-07 18:05 · libwd: update the source code
From 091bbf55057370ab571d8a84cc33465ad145e1a9 Mon Sep 17 00:00:00 2001
From: Yuzeng Zhuang <yisen.zhuang@huawei.com>
Date: Wed, 20 Mar 2024 16:12:48 +0800
Subject: [PATCH 26/44] cipher: add support for SM4 CFB and XTS modes in CE
instruction

This patch implements the SM4 CFB and XTS modes using CE instructions.

Signed-off-by: Yuzeng Zhuang <yisen.zhuang@huawei.com>
Signed-off-by: Qi Tao <taoqi10@huawei.com>
---
drv/isa_ce_sm4.c | 115 +++-
drv/isa_ce_sm4.h | 14 +
drv/isa_ce_sm4_armv8.S | 1126 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 1253 insertions(+), 2 deletions(-)
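
For orientation before the diff: CFB turns the block cipher into a stream cipher, where each ciphertext block is the plaintext XORed with the encryption of the previous ciphertext block (the IV seeds the first block). A minimal C sketch of that chaining for whole blocks only; sm4_encrypt_block is a hypothetical stand-in for a single-block SM4 primitive and is not part of this patch:

#include <stddef.h>
#include <string.h>

#define BLK 16

/* Hypothetical one-block SM4 encrypt, used only for illustration. */
void sm4_encrypt_block(const unsigned char in[BLK], unsigned char out[BLK],
		       const void *key);

/* CFB over whole blocks: C[i] = P[i] ^ E(C[i-1]), with C[-1] = IV. */
static void cfb_encrypt_blocks(const unsigned char *in, unsigned char *out,
			       size_t nblocks, const void *key,
			       unsigned char iv[BLK])
{
	unsigned char ks[BLK];
	size_t i, j;

	for (i = 0; i < nblocks; i++) {
		sm4_encrypt_block(iv, ks, key);	/* keystream = E(previous C) */
		for (j = 0; j < BLK; j++)
			out[j] = in[j] ^ ks[j];
		memcpy(iv, out, BLK);		/* chain the new ciphertext */
		in += BLK;
		out += BLK;
	}
}

Note that both CFB directions only ever run the cipher forward, which is why the driver change below selects the encrypt key schedule for WD_CIPHER_CFB.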
diff --git a/drv/isa_ce_sm4.c b/drv/isa_ce_sm4.c
index e2d81de..466b060 100644
--- a/drv/isa_ce_sm4.c
+++ b/drv/isa_ce_sm4.c
@@ -22,6 +22,8 @@
#define SM4_BLOCK_SIZE 16
#define MAX_BLOCK_NUM (1U << 28)
#define CTR96_SHIFT_BITS 8
+#define SM4_BYTES2BLKS(nbytes) ((nbytes) >> 4)
+#define SM4_KEY_SIZE 16
#define GETU32(p) \
((__u32)(p)[0] << 24 | (__u32)(p)[1] << 16 | (__u32)(p)[2] << 8 | (__u32)(p)[3])
@@ -136,10 +138,104 @@ void sm4_set_decrypt_key(const __u8 *userKey, struct SM4_KEY *key)
sm4_v8_set_decrypt_key(userKey, key);
}
+static void sm4_cfb_crypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey, const int enc)
+{
+ unsigned char keydata[SM4_BLOCK_SIZE];
+ const unsigned char *src = msg->in;
+ unsigned char *dst = msg->out;
+ __u32 nbytes = msg->in_bytes;
+ __u32 blocks, bbytes;
+ __u32 i = 0;
+
+ blocks = SM4_BYTES2BLKS(nbytes);
+ if (blocks) {
+ if (enc == SM4_ENCRYPT)
+ sm4_v8_cfb_encrypt_blocks(src, dst, blocks, rkey, msg->iv);
+ else
+ sm4_v8_cfb_decrypt_blocks(src, dst, blocks, rkey, msg->iv);
+
+ bbytes = blocks * SM4_BLOCK_SIZE;
+ dst += bbytes;
+ src += bbytes;
+ nbytes -= bbytes;
+ }
+
+ if (nbytes == 0)
+ return;
+
+ sm4_v8_crypt_block(msg->iv, keydata, rkey);
+ while (nbytes > 0) {
+ *dst++ = *src++ ^ keydata[i++];
+ nbytes--;
+ }
+
+ /* store new IV */
+ if (enc == SM4_ENCRYPT) {
+ if (msg->out_bytes >= msg->iv_bytes)
+ memcpy(msg->iv, msg->out + msg->out_bytes -
+ msg->iv_bytes, msg->iv_bytes);
+ else
+ memcpy(msg->iv, msg->out, msg->out_bytes);
+ } else {
+ if (msg->in_bytes >= msg->iv_bytes)
+ memcpy(msg->iv, msg->in + msg->in_bytes -
+ msg->iv_bytes, msg->iv_bytes);
+ else
+ memcpy(msg->iv, msg->in, msg->in_bytes);
+ }
+}
+
+static void sm4_cfb_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_enc)
+{
+ sm4_cfb_crypt(msg, rkey_enc, SM4_ENCRYPT);
+}
+
+static void sm4_cfb_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey_dec)
+{
+ sm4_cfb_crypt(msg, rkey_dec, SM4_DECRYPT);
+}
+
+static int sm4_xts_encrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey)
+{
+ struct SM4_KEY rkey2;
+
+ if (msg->in_bytes < SM4_BLOCK_SIZE) {
+ WD_ERR("invalid: cipher input length is wrong!\n");
+ return -WD_EINVAL;
+ }
+
+ /* set key for tweak */
+ sm4_set_encrypt_key(msg->key + SM4_KEY_SIZE, &rkey2);
+
+ sm4_v8_xts_encrypt(msg->in, msg->out, msg->in_bytes,
+ rkey, msg->iv, &rkey2);
+
+ return 0;
+}
+
+static int sm4_xts_decrypt(struct wd_cipher_msg *msg, const struct SM4_KEY *rkey)
+{
+ struct SM4_KEY rkey2;
+
+ if (msg->in_bytes < SM4_BLOCK_SIZE) {
+ WD_ERR("invalid: cipher input length is wrong!\n");
+ return -WD_EINVAL;
+ }
+
+ /* set key for tweak */
+ sm4_set_encrypt_key(msg->key + SM4_KEY_SIZE, &rkey2);
+
+ sm4_v8_xts_decrypt(msg->in, msg->out, msg->in_bytes,
+ rkey, msg->iv, &rkey2);
+
+ return 0;
+}
+
static int isa_ce_cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *wd_msg)
{
struct wd_cipher_msg *msg = wd_msg;
struct SM4_KEY rkey;
+ int ret = 0;
if (!msg) {
WD_ERR("invalid: input sm4 msg is NULL!\n");
@@ -151,7 +247,8 @@ static int isa_ce_cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *wd_
return -WD_EINVAL;
}
- if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR)
+ if (msg->op_type == WD_CIPHER_ENCRYPTION || msg->mode == WD_CIPHER_CTR
+ || msg->mode == WD_CIPHER_CFB)
sm4_set_encrypt_key(msg->key, &rkey);
else
sm4_set_decrypt_key(msg->key, &rkey);
@@ -166,12 +263,24 @@ static int isa_ce_cipher_send(struct wd_alg_driver *drv, handle_t ctx, void *wd_
case WD_CIPHER_CTR:
sm4_ctr_encrypt(msg, &rkey);
break;
+ case WD_CIPHER_CFB:
+ if (msg->op_type == WD_CIPHER_ENCRYPTION)
+ sm4_cfb_encrypt(msg, &rkey);
+ else
+ sm4_cfb_decrypt(msg, &rkey);
+ break;
+ case WD_CIPHER_XTS:
+ if (msg->op_type == WD_CIPHER_ENCRYPTION)
+ ret = sm4_xts_encrypt(msg, &rkey);
+ else
+ ret = sm4_xts_decrypt(msg, &rkey);
+ break;
default:
WD_ERR("The current block cipher mode is not supported!\n");
return -WD_EINVAL;
}
- return 0;
+ return ret;
}
static int isa_ce_cipher_recv(struct wd_alg_driver *drv, handle_t ctx, void *wd_msg)
@@ -206,6 +315,8 @@ static int cipher_recv(struct wd_alg_driver *drv, handle_t ctx, void *msg)
static struct wd_alg_driver cipher_alg_driver[] = {
GEN_CE_ALG_DRIVER("cbc(sm4)", cipher),
GEN_CE_ALG_DRIVER("ctr(sm4)", cipher),
+ GEN_CE_ALG_DRIVER("cfb(sm4)", cipher),
+ GEN_CE_ALG_DRIVER("xts(sm4)", cipher),
};
static void __attribute__((constructor)) isa_ce_probe(void)
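
A note on the XTS key layout assumed above: msg->key carries the two 128-bit XTS keys back to back, the data key first and the tweak key at offset SM4_KEY_SIZE, which is why the tweak schedule is derived from msg->key + SM4_KEY_SIZE. A sketch of how a caller might pack such a buffer (illustrative only; names are placeholders, not part of this patch):

#include <string.h>

#define SM4_KEY_SIZE 16

/* key1 encrypts the data blocks; key2 encrypts the tweak. */
static void xts_pack_key(unsigned char *msg_key,
			 const unsigned char key1[SM4_KEY_SIZE],
			 const unsigned char key2[SM4_KEY_SIZE])
{
	memcpy(msg_key, key1, SM4_KEY_SIZE);			/* data key  */
	memcpy(msg_key + SM4_KEY_SIZE, key2, SM4_KEY_SIZE);	/* tweak key */
}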
diff --git a/drv/isa_ce_sm4.h b/drv/isa_ce_sm4.h
index 0bc074d..d92069f 100644
--- a/drv/isa_ce_sm4.h
+++ b/drv/isa_ce_sm4.h
@@ -31,6 +31,20 @@ void sm4_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
void sm4_v8_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
size_t len, const void *key, const unsigned char ivec[16]);
+void sm4_v8_cfb_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t length, const struct SM4_KEY *key, unsigned char *ivec);
+void sm4_v8_cfb_decrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t length, const struct SM4_KEY *key, unsigned char *ivec);
+void sm4_v8_crypt_block(const unsigned char *in, unsigned char *out,
+ const struct SM4_KEY *key);
+
+int sm4_v8_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length,
+ const struct SM4_KEY *key, unsigned char *ivec,
+ const struct SM4_KEY *key2);
+int sm4_v8_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length,
+ const struct SM4_KEY *key, unsigned char *ivec,
+ const struct SM4_KEY *key2);
+
#ifdef __cplusplus
}
#endif
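
Two reading notes on these declarations: the length parameter of the CFB helpers is actually a block count (the assembly documents w2 as nblocks, and the C driver passes SM4_BYTES2BLKS(msg->in_bytes)), while the XTS helpers take a byte count. A hedged usage sketch, assuming struct SM4_KEY and sm4_v8_set_encrypt_key come from elsewhere in this header:

#include "isa_ce_sm4.h"

/* Illustration: encrypt 64 bytes (4 blocks) in CFB mode with the CE
 * routines; key, iv, src and dst are caller-supplied placeholders. */
static void cfb_demo(const unsigned char key[16], unsigned char iv[16],
		     const unsigned char *src, unsigned char *dst)
{
	struct SM4_KEY rkey;

	sm4_v8_set_encrypt_key(key, &rkey);
	/* Third argument is a block count; iv is updated in place. */
	sm4_v8_cfb_encrypt_blocks(src, dst, 4, &rkey, iv);
}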
diff --git a/drv/isa_ce_sm4_armv8.S b/drv/isa_ce_sm4_armv8.S
index d7d172a..342dfa5 100644
--- a/drv/isa_ce_sm4_armv8.S
+++ b/drv/isa_ce_sm4_armv8.S
@@ -37,6 +37,14 @@
.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
+.align 4
+.cts_permute_table:
+.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.globl sm4_v8_set_encrypt_key
.type sm4_v8_set_encrypt_key,%function
.align 5
@@ -772,3 +780,1121 @@ sm4_v8_ctr32_encrypt_blocks:
ldp d8,d9,[sp],#16
ret
.size sm4_v8_ctr32_encrypt_blocks,.-sm4_v8_ctr32_encrypt_blocks
+
+.globl sm4_v8_crypt_block
+.type sm4_v8_crypt_block,%function
+.align 5
+sm4_v8_crypt_block:
+ /* parameters:
+ * x0: src
+ * x1: dst
+ * x2: key
+ */
+ AARCH64_VALID_CALL_TARGET
+
+ ld1 {v0.16b-v3.16b}, [x2], #64
+ ld1 {v4.16b-v7.16b}, [x2]
+
+ ld1 {v16.4s},[x0]
+
+ rev32 v16.16b, v16.16b
+ sm4e v16.4s, v0.4s
+ sm4e v16.4s, v1.4s
+ sm4e v16.4s, v2.4s
+ sm4e v16.4s, v3.4s
+ sm4e v16.4s, v4.4s
+ sm4e v16.4s, v5.4s
+ sm4e v16.4s, v6.4s
+ sm4e v16.4s, v7.4s
+ rev64 v16.4s, v16.4s
+ ext v16.16b, v16.16b, v16.16b, #8
+ rev32 v16.16b, v16.16b
+
+ st1 {v16.16b}, [x1];
+
+ ret
+.size sm4_v8_crypt_block,.-sm4_v8_crypt_block
+
+.globl sm4_v8_cfb_encrypt_blocks
+.type sm4_v8_cfb_encrypt_blocks,%function
+.align 5
+sm4_v8_cfb_encrypt_blocks:
+ /* parameters:
+ * x0: src
+ * x1: dst
+ * w2: nblocks
+ * x3: key
+ * x4: iv
+ */
+ AARCH64_VALID_CALL_TARGET
+ stp d8,d9,[sp, #-16]!
+
+ ld1 {v0.4s-v3.4s}, [x3], #64
+ ld1 {v4.4s-v7.4s}, [x3]
+
+ ld1 {v8.4s},[x4]
+
+.loop_cfb_enc_4block:
+ cmp w2, #4
+ blt .loop_cfb_enc_1block
+
+ sub w2, w2, #4
+
+ ld1 {v16.4s-v19.4s}, [x0], #64
+
+ rev32 v8.16b, v8.16b
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+ rev32 v8.16b, v8.16b
+ eor v16.16b, v16.16b, v8.16b
+
+ rev32 v8.16b, v16.16b
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+ rev32 v8.16b, v8.16b
+ eor v17.16b, v17.16b, v8.16b
+
+ rev32 v8.16b, v17.16b
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+ rev32 v8.16b, v8.16b
+ eor v18.16b, v18.16b, v8.16b
+
+ rev32 v8.16b, v18.16b
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+ rev32 v8.16b, v8.16b
+ eor v19.16b, v19.16b, v8.16b
+
+ st1 {v16.4s-v19.4s}, [x1], #64
+ mov v8.16b, v19.16b
+
+ cbz w2, .end_cfb_enc
+ b .loop_cfb_enc_4block
+
+.loop_cfb_enc_1block:
+ sub w2, w2, #1
+
+ ld1 {v16.4s}, [x0], #16
+
+ rev32 v8.16b, v8.16b
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+ rev32 v8.16b, v8.16b
+ eor v8.16b, v8.16b, v16.16b
+
+ st1 {v8.4s}, [x1], #16
+
+ cbnz w2, .loop_cfb_enc_1block
+
+.end_cfb_enc:
+ st1 {v8.4s}, [x4]
+
+ ldp d8,d9,[sp],#16
+ ret
+.size sm4_v8_cfb_encrypt_blocks,.-sm4_v8_cfb_encrypt_blocks
+
+.globl sm4_v8_cfb_decrypt_blocks
+.type sm4_v8_cfb_decrypt_blocks,%function
+.align 5
+sm4_v8_cfb_decrypt_blocks:
+ /* parameters:
+ * x0: src
+ * x1: dst
+ * w2: nblocks
+ * x3: key
+ * x4: iv
+ */
+ AARCH64_VALID_CALL_TARGET
+ stp d8,d9,[sp, #-16]!
+
+ ld1 {v0.4s-v3.4s}, [x3], #64
+ ld1 {v4.4s-v7.4s}, [x3]
+
+ ld1 {v8.4s},[x4]
+
+.loop_cfb_dec_8block:
+ cmp w2, #8
+ blt .cfb_dec_4block
+
+ sub w2, w2, #8
+
+ ld1 {v12.4s-v15.4s}, [x0], #64
+ ld1 {v16.4s-v19.4s}, [x0], #64
+
+ rev32 v20.16b, v8.16b
+ rev32 v21.16b, v12.16b
+ rev32 v22.16b, v13.16b
+ rev32 v23.16b, v14.16b
+ rev32 v24.16b, v15.16b
+ rev32 v25.16b, v16.16b
+ rev32 v26.16b, v17.16b
+ rev32 v27.16b, v18.16b
+ sm4e v20.4s, v0.4s
+ sm4e v21.4s, v0.4s
+ sm4e v22.4s, v0.4s
+ sm4e v23.4s, v0.4s
+ sm4e v24.4s, v0.4s
+ sm4e v25.4s, v0.4s
+ sm4e v26.4s, v0.4s
+ sm4e v27.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v21.4s, v1.4s
+ sm4e v22.4s, v1.4s
+ sm4e v23.4s, v1.4s
+ sm4e v24.4s, v1.4s
+ sm4e v25.4s, v1.4s
+ sm4e v26.4s, v1.4s
+ sm4e v27.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v21.4s, v2.4s
+ sm4e v22.4s, v2.4s
+ sm4e v23.4s, v2.4s
+ sm4e v24.4s, v2.4s
+ sm4e v25.4s, v2.4s
+ sm4e v26.4s, v2.4s
+ sm4e v27.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v21.4s, v3.4s
+ sm4e v22.4s, v3.4s
+ sm4e v23.4s, v3.4s
+ sm4e v24.4s, v3.4s
+ sm4e v25.4s, v3.4s
+ sm4e v26.4s, v3.4s
+ sm4e v27.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v21.4s, v4.4s
+ sm4e v22.4s, v4.4s
+ sm4e v23.4s, v4.4s
+ sm4e v24.4s, v4.4s
+ sm4e v25.4s, v4.4s
+ sm4e v26.4s, v4.4s
+ sm4e v27.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v21.4s, v5.4s
+ sm4e v22.4s, v5.4s
+ sm4e v23.4s, v5.4s
+ sm4e v24.4s, v5.4s
+ sm4e v25.4s, v5.4s
+ sm4e v26.4s, v5.4s
+ sm4e v27.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v21.4s, v6.4s
+ sm4e v22.4s, v6.4s
+ sm4e v23.4s, v6.4s
+ sm4e v24.4s, v6.4s
+ sm4e v25.4s, v6.4s
+ sm4e v26.4s, v6.4s
+ sm4e v27.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ sm4e v21.4s, v7.4s
+ sm4e v22.4s, v7.4s
+ sm4e v23.4s, v7.4s
+ sm4e v24.4s, v7.4s
+ sm4e v25.4s, v7.4s
+ sm4e v26.4s, v7.4s
+ sm4e v27.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ rev64 v21.4s, v21.4s
+ rev64 v22.4s, v22.4s
+ rev64 v23.4s, v23.4s
+ rev64 v24.4s, v24.4s
+ rev64 v25.4s, v25.4s
+ rev64 v26.4s, v26.4s
+ rev64 v27.4s, v27.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ ext v21.16b, v21.16b, v21.16b, #8
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ ext v24.16b, v24.16b, v24.16b, #8
+ ext v25.16b, v25.16b, v25.16b, #8
+ ext v26.16b, v26.16b, v26.16b, #8
+ ext v27.16b, v27.16b, v27.16b, #8
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ rev32 v26.16b, v26.16b
+ rev32 v27.16b, v27.16b
+
+ mov v8.16b, v19.16b //Modify IV
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v21.16b, v21.16b, v13.16b
+ eor v22.16b, v22.16b, v14.16b
+ eor v23.16b, v23.16b, v15.16b
+ eor v24.16b, v24.16b, v16.16b
+ eor v25.16b, v25.16b, v17.16b
+ eor v26.16b, v26.16b, v18.16b
+ eor v27.16b, v27.16b, v19.16b
+
+ st1 {v20.4s-v23.4s}, [x1], #64
+ st1 {v24.4s-v27.4s}, [x1], #64
+
+ cbz w2, .end_cfb_dec
+ b .loop_cfb_dec_8block
+
+.cfb_dec_4block:
+ cmp w2, #4
+ blt .loop_cfb_dec_1block
+
+ sub w2, w2, #4
+
+ ld1 {v12.4s-v15.4s}, [x0], #64
+
+ rev32 v20.16b, v8.16b
+ rev32 v21.16b, v12.16b
+ rev32 v22.16b, v13.16b
+ rev32 v23.16b, v14.16b
+ sm4e v20.4s, v0.4s
+ sm4e v21.4s, v0.4s
+ sm4e v22.4s, v0.4s
+ sm4e v23.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v21.4s, v1.4s
+ sm4e v22.4s, v1.4s
+ sm4e v23.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v21.4s, v2.4s
+ sm4e v22.4s, v2.4s
+ sm4e v23.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v21.4s, v3.4s
+ sm4e v22.4s, v3.4s
+ sm4e v23.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v21.4s, v4.4s
+ sm4e v22.4s, v4.4s
+ sm4e v23.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v21.4s, v5.4s
+ sm4e v22.4s, v5.4s
+ sm4e v23.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v21.4s, v6.4s
+ sm4e v22.4s, v6.4s
+ sm4e v23.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ sm4e v21.4s, v7.4s
+ sm4e v22.4s, v7.4s
+ sm4e v23.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ rev64 v21.4s, v21.4s
+ rev64 v22.4s, v22.4s
+ rev64 v23.4s, v23.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ ext v21.16b, v21.16b, v21.16b, #8
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+
+ mov v8.16b, v15.16b //Modify IV
+
+ eor v20.16b, v20.16b, v12.16b
+ eor v21.16b, v21.16b, v13.16b
+ eor v22.16b, v22.16b, v14.16b
+ eor v23.16b, v23.16b, v15.16b
+
+ st1 {v20.4s-v23.4s}, [x1], #64
+
+ cbz w2, .end_cfb_dec
+
+.loop_cfb_dec_1block:
+ sub w2, w2, #1
+
+ ld1 {v12.4s}, [x0], #16
+
+ rev32 v20.16b, v8.16b
+ sm4e v20.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ rev32 v20.16b, v20.16b
+
+ eor v20.16b, v20.16b, v12.16b
+ st1 {v20.4s}, [x1], #16
+
+ mov v8.16b, v12.16b //Modify IV
+
+ cbnz w2, .loop_cfb_dec_1block
+
+.end_cfb_dec:
+ /* store new IV */
+ st1 {v8.4s}, [x4]
+
+ ldp d8,d9,[sp],#16
+ ret
+.size sm4_v8_cfb_decrypt_blocks,.-sm4_v8_cfb_decrypt_blocks
+
+#define tweak_calc(out, in, MSK, TMP) \
+ sshr TMP.2d, in.2d, #63; \
+ and TMP.16b, TMP.16b, MSK.16b; \
+ add out.2d, in.2d, in.2d; \
+ ext TMP.16b, TMP.16b, TMP.16b, #8; \
+ eor out.16b, out.16b, TMP.16b;
+
+.globl sm4_v8_xts_encrypt
+.type sm4_v8_xts_encrypt,%function
+.align 5
+sm4_v8_xts_encrypt:
+ /* parameters:
+ * x0: src
+ * x1: dst
+ * w2: nbytes
+ * x3: key
+ * x4: tweak
+ * x5: key array for tweak
+ */
+ AARCH64_VALID_CALL_TARGET
+ stp d8,d9,[sp, #-16]!
+
+ ld1 {v8.16b}, [x4]
+
+ cbz x5, .enc_xts_nokey2
+
+ /* load round key array for tweak */
+ ld1 {v0.16b-v3.16b}, [x5], #64
+ ld1 {v4.16b-v7.16b}, [x5]
+
+ /* first tweak */
+ rev32 v8.16b, v8.16b
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+ rev32 v8.16b, v8.16b
+
+.enc_xts_nokey2:
+ /* load key array */
+ ld1 {v0.16b-v3.16b}, [x3], #64
+ ld1 {v4.16b-v7.16b}, [x3]
+
+ and w5, w2, #15
+ lsr w2, w2, #4
+ cbz w5, .enc_xts_mask
+ /* leave the last block for tail */
+ sub w2, w2, #1
+
+.enc_xts_mask:
+ /* init mask */
+ movi v31.2s, #0x1
+ movi v16.2s, #0x87
+ uzp1 v31.4s, v31.4s, v16.4s
+
+ cbz w2, .enc_xts_tail
+
+.enc_xts_8block:
+ sub w2, w2, #8
+ tbnz w2, #31, .enc_xts_4block
+
+ tweak_calc(v9, v8, v31, v16)
+ tweak_calc(v10, v9, v31, v17)
+ tweak_calc(v11, v10, v31, v18)
+ tweak_calc(v12, v11, v31, v19)
+ tweak_calc(v13, v12, v31, v16)
+ tweak_calc(v14, v13, v31, v17)
+ tweak_calc(v15, v14, v31, v18)
+
+ ld1 {v20.16b-v23.16b}, [x0], #64
+ ld1 {v24.16b-v27.16b}, [x0], #64
+ eor v20.16b, v20.16b, v8.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v11.16b
+ eor v24.16b, v24.16b, v12.16b
+ eor v25.16b, v25.16b, v13.16b
+ eor v26.16b, v26.16b, v14.16b
+ eor v27.16b, v27.16b, v15.16b
+
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ rev32 v26.16b, v26.16b
+ rev32 v27.16b, v27.16b
+ sm4e v20.4s, v0.4s
+ sm4e v21.4s, v0.4s
+ sm4e v22.4s, v0.4s
+ sm4e v23.4s, v0.4s
+ sm4e v24.4s, v0.4s
+ sm4e v25.4s, v0.4s
+ sm4e v26.4s, v0.4s
+ sm4e v27.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v21.4s, v1.4s
+ sm4e v22.4s, v1.4s
+ sm4e v23.4s, v1.4s
+ sm4e v24.4s, v1.4s
+ sm4e v25.4s, v1.4s
+ sm4e v26.4s, v1.4s
+ sm4e v27.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v21.4s, v2.4s
+ sm4e v22.4s, v2.4s
+ sm4e v23.4s, v2.4s
+ sm4e v24.4s, v2.4s
+ sm4e v25.4s, v2.4s
+ sm4e v26.4s, v2.4s
+ sm4e v27.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v21.4s, v3.4s
+ sm4e v22.4s, v3.4s
+ sm4e v23.4s, v3.4s
+ sm4e v24.4s, v3.4s
+ sm4e v25.4s, v3.4s
+ sm4e v26.4s, v3.4s
+ sm4e v27.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v21.4s, v4.4s
+ sm4e v22.4s, v4.4s
+ sm4e v23.4s, v4.4s
+ sm4e v24.4s, v4.4s
+ sm4e v25.4s, v4.4s
+ sm4e v26.4s, v4.4s
+ sm4e v27.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v21.4s, v5.4s
+ sm4e v22.4s, v5.4s
+ sm4e v23.4s, v5.4s
+ sm4e v24.4s, v5.4s
+ sm4e v25.4s, v5.4s
+ sm4e v26.4s, v5.4s
+ sm4e v27.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v21.4s, v6.4s
+ sm4e v22.4s, v6.4s
+ sm4e v23.4s, v6.4s
+ sm4e v24.4s, v6.4s
+ sm4e v25.4s, v6.4s
+ sm4e v26.4s, v6.4s
+ sm4e v27.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ sm4e v21.4s, v7.4s
+ sm4e v22.4s, v7.4s
+ sm4e v23.4s, v7.4s
+ sm4e v24.4s, v7.4s
+ sm4e v25.4s, v7.4s
+ sm4e v26.4s, v7.4s
+ sm4e v27.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ rev64 v21.4s, v21.4s
+ rev64 v22.4s, v22.4s
+ rev64 v23.4s, v23.4s
+ rev64 v24.4s, v24.4s
+ rev64 v25.4s, v25.4s
+ rev64 v26.4s, v26.4s
+ rev64 v27.4s, v27.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ ext v21.16b, v21.16b, v21.16b, #8
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ ext v24.16b, v24.16b, v24.16b, #8
+ ext v25.16b, v25.16b, v25.16b, #8
+ ext v26.16b, v26.16b, v26.16b, #8
+ ext v27.16b, v27.16b, v27.16b, #8
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ rev32 v26.16b, v26.16b
+ rev32 v27.16b, v27.16b
+
+ eor v20.16b, v20.16b, v8.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v11.16b
+ eor v24.16b, v24.16b, v12.16b
+ eor v25.16b, v25.16b, v13.16b
+ eor v26.16b, v26.16b, v14.16b
+ eor v27.16b, v27.16b, v15.16b
+ st1 {v20.16b-v23.16b}, [x1], #64
+ st1 {v24.16b-v27.16b}, [x1], #64
+
+ tweak_calc(v8, v15, v31, v19)
+
+ cbz w2, .enc_xts_tail
+ b .enc_xts_8block
+
+.enc_xts_4block:
+ add w2, w2, #8
+ cmp w2, #4
+ blt .enc_xts_1block
+
+ sub w2, w2, #4
+
+ tweak_calc(v9, v8, v31, v16)
+ tweak_calc(v10, v9, v31, v17)
+ tweak_calc(v11, v10, v31, v18)
+
+ ld1 {v20.16b-v23.16b}, [x0], #64
+ eor v20.16b, v20.16b, v8.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v11.16b
+
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+ sm4e v20.4s, v0.4s
+ sm4e v21.4s, v0.4s
+ sm4e v22.4s, v0.4s
+ sm4e v23.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v21.4s, v1.4s
+ sm4e v22.4s, v1.4s
+ sm4e v23.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v21.4s, v2.4s
+ sm4e v22.4s, v2.4s
+ sm4e v23.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v21.4s, v3.4s
+ sm4e v22.4s, v3.4s
+ sm4e v23.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v21.4s, v4.4s
+ sm4e v22.4s, v4.4s
+ sm4e v23.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v21.4s, v5.4s
+ sm4e v22.4s, v5.4s
+ sm4e v23.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v21.4s, v6.4s
+ sm4e v22.4s, v6.4s
+ sm4e v23.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ sm4e v21.4s, v7.4s
+ sm4e v22.4s, v7.4s
+ sm4e v23.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ rev64 v21.4s, v21.4s
+ rev64 v22.4s, v22.4s
+ rev64 v23.4s, v23.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ ext v21.16b, v21.16b, v21.16b, #8
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+
+ eor v20.16b, v20.16b, v8.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v11.16b
+ st1 {v20.16b-v23.16b}, [x1], #64
+
+ tweak_calc(v8, v11, v31, v19)
+
+ cbz w2, .enc_xts_tail
+
+.enc_xts_1block:
+ sub w2, w2, #1
+
+ ld1 {v20.16b}, [x0], #16
+ eor v20.16b, v20.16b, v8.16b
+
+ rev32 v20.16b, v20.16b
+ sm4e v20.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ rev32 v20.16b, v20.16b
+
+ eor v20.16b, v20.16b, v8.16b
+ st1 {v20.16b}, [x1], #16
+
+ tweak_calc(v8, v8, v31, v16)
+
+ cbnz w2, .enc_xts_1block
+
+.enc_xts_tail:
+ uxtw x5, w5
+ cbz x5, .enc_xts_end
+
+ tweak_calc(v9, v8, v31, v16)
+ ld1 {v20.16b}, [x0]
+ eor v20.16b, v20.16b, v8.16b
+ rev32 v20.16b, v20.16b
+ sm4e v20.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ rev32 v20.16b, v20.16b
+ eor v20.16b, v20.16b, v8.16b
+
+ adr x6, .cts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5
+ sub x7, x7, x5
+ ld1 {v23.16b}, [x6]
+ ld1 {v24.16b}, [x7]
+
+ add x0, x0, x5
+ ld1 {v21.16b}, [x0]
+
+ tbl v22.16b, {v20.16b}, v23.16b
+ tbx v20.16b, {v21.16b}, v24.16b
+
+ eor v20.16b, v20.16b, v9.16b
+ rev32 v20.16b, v20.16b
+ sm4e v20.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ rev32 v20.16b, v20.16b
+ eor v20.16b, v20.16b, v9.16b
+
+ add x5, x1, x5
+ st1 {v22.16b}, [x5]
+ st1 {v20.16b}, [x1]
+
+ b .enc_xts_ret
+
+.enc_xts_end:
+ /* new tweak */
+ st1 {v8.16b}, [x4]
+
+.enc_xts_ret:
+ ldp d8,d9,[sp],#16
+ ret
+.size sm4_v8_xts_encrypt,.-sm4_v8_xts_encrypt
+
+.globl sm4_v8_xts_decrypt
+.type sm4_v8_xts_decrypt,%function
+.align 5
+sm4_v8_xts_decrypt:
+ /* parameters:
+ * x0: src
+ * x1: dst
+ * w2: nbytes
+ * x3: key
+ * x4: tweak
+ * x5: key array for tweak
+ */
+ AARCH64_VALID_CALL_TARGET
+ stp d8,d9,[sp, #-16]!
+
+ ld1 {v8.16b}, [x4]
+
+ cbz x5, .dec_xts_nokey2
+
+ /* load round key array for tweak */
+ ld1 {v0.16b-v3.16b}, [x5], #64
+ ld1 {v4.16b-v7.16b}, [x5]
+
+ /* first tweak */
+ rev32 v8.16b, v8.16b
+ sm4e v8.4s, v0.4s
+ sm4e v8.4s, v1.4s
+ sm4e v8.4s, v2.4s
+ sm4e v8.4s, v3.4s
+ sm4e v8.4s, v4.4s
+ sm4e v8.4s, v5.4s
+ sm4e v8.4s, v6.4s
+ sm4e v8.4s, v7.4s
+ rev64 v8.4s, v8.4s
+ ext v8.16b, v8.16b, v8.16b, #8
+ rev32 v8.16b, v8.16b
+
+.dec_xts_nokey2:
+ ld1 {v0.16b-v3.16b}, [x3], #64
+ ld1 {v4.16b-v7.16b}, [x3]
+
+ and w5, w2, #15
+ lsr w2, w2, #4
+ cbz w5, .dec_xts_mask
+ /* leave the last block for tail */
+ sub w2, w2, #1
+
+.dec_xts_mask:
+ /* init mask */
+ movi v31.2s, #0x1
+ movi v16.2s, #0x87
+ uzp1 v31.4s, v31.4s, v16.4s
+
+ cbz w2, .dec_xts_tail
+
+.dec_xts_8block:
+ sub w2, w2, #8
+ tbnz w2, #31, .dec_xts_4block
+
+ tweak_calc(v9, v8, v31, v16)
+ tweak_calc(v10, v9, v31, v17)
+ tweak_calc(v11, v10, v31, v18)
+ tweak_calc(v12, v11, v31, v19)
+ tweak_calc(v13, v12, v31, v16)
+ tweak_calc(v14, v13, v31, v17)
+ tweak_calc(v15, v14, v31, v18)
+
+ ld1 {v20.16b-v23.16b}, [x0], #64
+ ld1 {v24.16b-v27.16b}, [x0], #64
+ eor v20.16b, v20.16b, v8.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v11.16b
+ eor v24.16b, v24.16b, v12.16b
+ eor v25.16b, v25.16b, v13.16b
+ eor v26.16b, v26.16b, v14.16b
+ eor v27.16b, v27.16b, v15.16b
+
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ rev32 v26.16b, v26.16b
+ rev32 v27.16b, v27.16b
+ sm4e v20.4s, v0.4s
+ sm4e v21.4s, v0.4s
+ sm4e v22.4s, v0.4s
+ sm4e v23.4s, v0.4s
+ sm4e v24.4s, v0.4s
+ sm4e v25.4s, v0.4s
+ sm4e v26.4s, v0.4s
+ sm4e v27.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v21.4s, v1.4s
+ sm4e v22.4s, v1.4s
+ sm4e v23.4s, v1.4s
+ sm4e v24.4s, v1.4s
+ sm4e v25.4s, v1.4s
+ sm4e v26.4s, v1.4s
+ sm4e v27.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v21.4s, v2.4s
+ sm4e v22.4s, v2.4s
+ sm4e v23.4s, v2.4s
+ sm4e v24.4s, v2.4s
+ sm4e v25.4s, v2.4s
+ sm4e v26.4s, v2.4s
+ sm4e v27.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v21.4s, v3.4s
+ sm4e v22.4s, v3.4s
+ sm4e v23.4s, v3.4s
+ sm4e v24.4s, v3.4s
+ sm4e v25.4s, v3.4s
+ sm4e v26.4s, v3.4s
+ sm4e v27.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v21.4s, v4.4s
+ sm4e v22.4s, v4.4s
+ sm4e v23.4s, v4.4s
+ sm4e v24.4s, v4.4s
+ sm4e v25.4s, v4.4s
+ sm4e v26.4s, v4.4s
+ sm4e v27.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v21.4s, v5.4s
+ sm4e v22.4s, v5.4s
+ sm4e v23.4s, v5.4s
+ sm4e v24.4s, v5.4s
+ sm4e v25.4s, v5.4s
+ sm4e v26.4s, v5.4s
+ sm4e v27.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v21.4s, v6.4s
+ sm4e v22.4s, v6.4s
+ sm4e v23.4s, v6.4s
+ sm4e v24.4s, v6.4s
+ sm4e v25.4s, v6.4s
+ sm4e v26.4s, v6.4s
+ sm4e v27.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ sm4e v21.4s, v7.4s
+ sm4e v22.4s, v7.4s
+ sm4e v23.4s, v7.4s
+ sm4e v24.4s, v7.4s
+ sm4e v25.4s, v7.4s
+ sm4e v26.4s, v7.4s
+ sm4e v27.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ rev64 v21.4s, v21.4s
+ rev64 v22.4s, v22.4s
+ rev64 v23.4s, v23.4s
+ rev64 v24.4s, v24.4s
+ rev64 v25.4s, v25.4s
+ rev64 v26.4s, v26.4s
+ rev64 v27.4s, v27.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ ext v21.16b, v21.16b, v21.16b, #8
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ ext v24.16b, v24.16b, v24.16b, #8
+ ext v25.16b, v25.16b, v25.16b, #8
+ ext v26.16b, v26.16b, v26.16b, #8
+ ext v27.16b, v27.16b, v27.16b, #8
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ rev32 v26.16b, v26.16b
+ rev32 v27.16b, v27.16b
+
+ eor v20.16b, v20.16b, v8.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v11.16b
+ eor v24.16b, v24.16b, v12.16b
+ eor v25.16b, v25.16b, v13.16b
+ eor v26.16b, v26.16b, v14.16b
+ eor v27.16b, v27.16b, v15.16b
+ st1 {v20.16b-v23.16b}, [x1], #64
+ st1 {v24.16b-v27.16b}, [x1], #64
+
+ tweak_calc(v8, v15, v31, v19)
+
+ cbz w2, .dec_xts_tail
+ b .dec_xts_8block
+
+.dec_xts_4block:
+ add w2, w2, #8
+ cmp w2, #4
+ blt .dec_xts_1block
+
+ sub w2, w2, #4
+
+ tweak_calc(v9, v8, v31, v16)
+ tweak_calc(v10, v9, v31, v17)
+ tweak_calc(v11, v10, v31, v18)
+
+ ld1 {v20.16b-v23.16b}, [x0], #64
+ eor v20.16b, v20.16b, v8.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v11.16b
+
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+ sm4e v20.4s, v0.4s
+ sm4e v21.4s, v0.4s
+ sm4e v22.4s, v0.4s
+ sm4e v23.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v21.4s, v1.4s
+ sm4e v22.4s, v1.4s
+ sm4e v23.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v21.4s, v2.4s
+ sm4e v22.4s, v2.4s
+ sm4e v23.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v21.4s, v3.4s
+ sm4e v22.4s, v3.4s
+ sm4e v23.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v21.4s, v4.4s
+ sm4e v22.4s, v4.4s
+ sm4e v23.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v21.4s, v5.4s
+ sm4e v22.4s, v5.4s
+ sm4e v23.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v21.4s, v6.4s
+ sm4e v22.4s, v6.4s
+ sm4e v23.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ sm4e v21.4s, v7.4s
+ sm4e v22.4s, v7.4s
+ sm4e v23.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ rev64 v21.4s, v21.4s
+ rev64 v22.4s, v22.4s
+ rev64 v23.4s, v23.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ ext v21.16b, v21.16b, v21.16b, #8
+ ext v22.16b, v22.16b, v22.16b, #8
+ ext v23.16b, v23.16b, v23.16b, #8
+ rev32 v20.16b, v20.16b
+ rev32 v21.16b, v21.16b
+ rev32 v22.16b, v22.16b
+ rev32 v23.16b, v23.16b
+
+ eor v20.16b, v20.16b, v8.16b
+ eor v21.16b, v21.16b, v9.16b
+ eor v22.16b, v22.16b, v10.16b
+ eor v23.16b, v23.16b, v11.16b
+ st1 {v20.16b-v23.16b}, [x1], #64
+
+ tweak_calc(v8, v11, v31, v19)
+
+ cbz w2, .dec_xts_tail
+
+.dec_xts_1block:
+ sub w2, w2, #1
+
+ ld1 {v20.16b}, [x0], #16
+ eor v20.16b, v20.16b, v8.16b
+
+ rev32 v20.16b, v20.16b
+ sm4e v20.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ rev32 v20.16b, v20.16b
+
+ eor v20.16b, v20.16b, v8.16b
+ st1 {v20.16b}, [x1], #16
+
+ tweak_calc(v8, v8, v31, v16)
+
+ cbnz w2, .dec_xts_1block
+
+.dec_xts_tail:
+ uxtw x5, w5
+ cbz x5, .dec_xts_end
+
+ tweak_calc(v9, v8, v31, v16)
+ ld1 {v20.16b}, [x0]
+ eor v20.16b, v20.16b, v9.16b
+ rev32 v20.16b, v20.16b
+ sm4e v20.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ rev32 v20.16b, v20.16b
+ eor v20.16b, v20.16b, v9.16b
+
+ adr x6, .cts_permute_table
+ add x7, x6, #32
+ add x6, x6, x5
+ sub x7, x7, x5
+ ld1 {v23.16b}, [x6]
+ ld1 {v24.16b}, [x7]
+
+ add x0, x0, x5
+ ld1 {v21.16b}, [x0]
+
+ tbl v22.16b, {v20.16b}, v23.16b
+ tbx v20.16b, {v21.16b}, v24.16b
+
+ eor v20.16b, v20.16b, v8.16b
+ rev32 v20.16b, v20.16b
+ sm4e v20.4s, v0.4s
+ sm4e v20.4s, v1.4s
+ sm4e v20.4s, v2.4s
+ sm4e v20.4s, v3.4s
+ sm4e v20.4s, v4.4s
+ sm4e v20.4s, v5.4s
+ sm4e v20.4s, v6.4s
+ sm4e v20.4s, v7.4s
+ rev64 v20.4s, v20.4s
+ ext v20.16b, v20.16b, v20.16b, #8
+ rev32 v20.16b, v20.16b
+ eor v20.16b, v20.16b, v8.16b
+
+ add x5, x1, x5
+ st1 {v22.16b}, [x5]
+ st1 {v20.16b}, [x1]
+
+ b .dec_xts_ret
+
+.dec_xts_end:
+ /* new tweak */
+ st1 {v8.16b}, [x4]
+
+.dec_xts_ret:
+ ldp d8,d9,[sp],#16
+ ret
+.size sm4_v8_xts_decrypt,.-sm4_v8_xts_decrypt
--
2.25.1
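
For readers decoding the tweak_calc macro above: it multiplies the 128-bit XTS tweak by x in GF(2^128) with the reduction polynomial 0x87, treating the tweak as two little-endian 64-bit halves (sshr/and extract the carry bits, add doubles both halves, ext/eor folds the top carry back into the low half). A C rendering of the same step, assuming a little-endian host so the byte layout matches the vector registers:

#include <stdint.h>
#include <string.h>

/* Advance an XTS tweak: multiply by x in GF(2^128), reduction poly 0x87. */
static void xts_tweak_next(unsigned char t[16])
{
	uint64_t lo, hi, carry;

	memcpy(&lo, t, 8);
	memcpy(&hi, t + 8, 8);
	carry = hi >> 63;			/* bit shifted off the top */
	hi = (hi << 1) | (lo >> 63);		/* 128-bit left shift      */
	lo = (lo << 1) ^ (carry * 0x87);	/* fold the carry back in  */
	memcpy(t, &lo, 8);
	memcpy(t + 8, &hi, 8);
}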