代码拉取完成,页面将自动刷新
From 5dbc53c96ac4efcf26b4dbcdbbf55d1b5e7a06be Mon Sep 17 00:00:00 2001
From: Weili Qian <qianweili@huawei.com>
Date: Sat, 23 Mar 2024 18:00:43 +0800
Subject: [PATCH 32/44] uadk/hash_mb: support multi-buffer calculation for sm3
and md5
Support sm3 and md5 multi-buffer calculation by using SVE instructions.
If the platform supports SVE instructions, users can choose SVE instructions
to perform sm3 and md5 algorithm calculations.
The assembly implementation is from isa-l_crypto:
https://github.com/intel/isa-l_crypto.git
Signed-off-by: Weili Qian <qianweili@huawei.com>
---
Makefile.am | 15 +-
drv/hash_mb/hash_mb.c | 843 ++++++++++++++++++++++++++++++++++
drv/hash_mb/hash_mb.h | 62 +++
drv/hash_mb/md5_mb_asimd_x1.S | 248 ++++++++++
drv/hash_mb/md5_mb_asimd_x4.S | 526 +++++++++++++++++++++
drv/hash_mb/md5_mb_sve.S | 158 +++++++
drv/hash_mb/md5_sve_common.S | 478 +++++++++++++++++++
drv/hash_mb/sm3_mb_asimd_x1.S | 387 ++++++++++++++++
drv/hash_mb/sm3_mb_asimd_x4.S | 576 +++++++++++++++++++++++
drv/hash_mb/sm3_mb_sve.S | 161 +++++++
drv/hash_mb/sm3_sve_common.S | 505 ++++++++++++++++++++
11 files changed, 3958 insertions(+), 1 deletion(-)
create mode 100644 drv/hash_mb/hash_mb.c
create mode 100644 drv/hash_mb/hash_mb.h
create mode 100644 drv/hash_mb/md5_mb_asimd_x1.S
create mode 100644 drv/hash_mb/md5_mb_asimd_x4.S
create mode 100644 drv/hash_mb/md5_mb_sve.S
create mode 100644 drv/hash_mb/md5_sve_common.S
create mode 100644 drv/hash_mb/sm3_mb_asimd_x1.S
create mode 100644 drv/hash_mb/sm3_mb_asimd_x4.S
create mode 100644 drv/hash_mb/sm3_mb_sve.S
create mode 100644 drv/hash_mb/sm3_sve_common.S
diff --git a/Makefile.am b/Makefile.am
index f78ad14..68f3106 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -45,7 +45,7 @@ lib_LTLIBRARIES=libwd.la libwd_comp.la libwd_crypto.la
uadk_driversdir=$(libdir)/uadk
uadk_drivers_LTLIBRARIES=libhisi_sec.la libhisi_hpre.la libhisi_zip.la \
- libisa_ce.la
+ libisa_ce.la libisa_sve.la
libwd_la_SOURCES=wd.c wd_mempool.c wd.h wd_alg.c wd_alg.h \
v1/wd.c v1/wd.h v1/wd_adapter.c v1/wd_adapter.h \
@@ -94,6 +94,12 @@ libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \
libisa_ce_la_SOURCES=arm_arch_ce.h drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S isa_ce_sm3.h \
drv/isa_ce_sm4.c drv/isa_ce_sm4_armv8.S drv/isa_ce_sm4.h
+libisa_sve_la_SOURCES=drv/hash_mb/hash_mb.c wd_digest_drv.h drv/hash_mb/hash_mb.h \
+ drv/hash_mb/sm3_sve_common.S drv/hash_mb/sm3_mb_asimd_x1.S \
+ drv/hash_mb/sm3_mb_asimd_x4.S drv/hash_mb/sm3_mb_sve.S \
+ drv/hash_mb/md5_sve_common.S drv/hash_mb/md5_mb_asimd_x1.S \
+ drv/hash_mb/md5_mb_asimd_x4.S drv/hash_mb/md5_mb_sve.S
+
if WD_STATIC_DRV
AM_CFLAGS += -DWD_STATIC_DRV -fPIC
AM_CFLAGS += -DWD_NO_LOG
@@ -117,6 +123,9 @@ libhisi_hpre_la_DEPENDENCIES = libwd.la libwd_crypto.la
libisa_ce_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS)
libisa_ce_la_DEPENDENCIES = libwd.la libwd_crypto.la
+libisa_sve_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS)
+libisa_sve_la_DEPENDENCIES = libwd.la libwd_crypto.la
+
else
UADK_WD_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd.map
UADK_CRYPTO_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd_crypto.map
@@ -149,6 +158,10 @@ libhisi_hpre_la_DEPENDENCIES= libwd.la libwd_crypto.la
libisa_ce_la_LIBADD= -lwd -lwd_crypto
libisa_ce_la_LDFLAGS=$(UADK_VERSION)
libisa_ce_la_DEPENDENCIES= libwd.la libwd_crypto.la
+
+libisa_sve_la_LIBADD= -lwd -lwd_crypto
+libisa_sve_la_LDFLAGS=$(UADK_VERSION)
+libisa_sve_la_DEPENDENCIES= libwd.la libwd_crypto.la
endif # WD_STATIC_DRV
pkgconfigdir = $(libdir)/pkgconfig
diff --git a/drv/hash_mb/hash_mb.c b/drv/hash_mb/hash_mb.c
new file mode 100644
index 0000000..a73c698
--- /dev/null
+++ b/drv/hash_mb/hash_mb.c
@@ -0,0 +1,843 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+/* Copyright 2024 Huawei Technologies Co.,Ltd. All rights reserved. */
+
+#include <sys/auxv.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+#include "hash_mb.h"
+
+#define MIN(a, b) (((a) > (b)) ? (b) : (a)) /* evaluates its arguments twice */
+#define IPAD_VALUE 0x36 /* byte XORed into the key for the inner HMAC block */
+#define OPAD_VALUE 0x5C /* byte XORed into the key for the outer HMAC block */
+#define HASH_KEY_LEN 64 /* bytes produced by hash_xor(), one hash block */
+#define HASH_BLOCK_OFFSET 6 /* log2(HASH_BLOCK_SIZE): shift between bytes and blocks */
+#define HASH_BLOCK_SIZE 64
+#define HASH_PADLENGTHFIELD_SIZE 56 /* offset of the 8-byte length field in a one-block pad */
+#define HASH_PADDING_SIZE 120 /* offset of the 8-byte length field in a two-block pad */
+#define HASH_HIGH_32BITS 32 /* shift that extracts the upper half of the 64-bit bit count */
+#define HASH_PADDING_BLOCKS 2
+#define HASH_NENO_PROCESS_JOBS 4 /* batch size handled by the asimd_x4 kernel ("NENO" presumably means NEON) */
+#define HASH_TRY_PROCESS_COUNT 16 /* max iterations of the poll loop in hash_mb_recv() */
+#define BYTES_TO_BITS_OFFSET 3 /* shift converting a byte count to a bit count */
+
+#define MD5_DIGEST_DATA_SIZE 16
+#define SM3_DIGEST_DATA_SIZE 32
+#define HASH_MAX_LANES 32
+#define SM3_MAX_LANES 16
+/* Store the low 32 bits of V into p[0..3], most significant byte first. */
+#define PUTU32(p, V) \
+ ((p)[0] = (uint8_t)((V) >> 24), \
+ (p)[1] = (uint8_t)((V) >> 16), \
+ (p)[2] = (uint8_t)((V) >> 8), \
+ (p)[3] = (uint8_t)(V))
+/* Per-algorithm table: the assembly kernels, the initial digest value and batch limits. */
+struct hash_mb_ops {
+ int (*max_lanes)(void); /* combined with max_jobs to cap the batch size in hash_mb_do_jobs() */
+ void (*asimd_x4)(struct hash_job *job1, struct hash_job *job2,
+ struct hash_job *job3, struct hash_job *job4, int len);
+ void (*asimd_x1)(struct hash_job *job, int len); /* one job; len counts 64-byte blocks */
+ void (*sve)(int blocks, int total_lanes, struct hash_job **job_vec);
+ __u8 *iv_data; /* initial digest state copied into hash_job.result_digest */
+ int iv_bytes; /* size of iv_data; also the number of digest bytes saved/restored */
+ int max_jobs; /* upper bound on the number of jobs taken per batch */
+};
+/* Singly linked list of pending jobs for one algorithm. */
+struct hash_mb_poll_queue {
+ struct hash_job *head;
+ struct hash_job *tail;
+ pthread_spinlock_t s_lock; /* held while head, tail or job_num are updated */
+ const struct hash_mb_ops *ops;
+ __u32 job_num; /* number of queued jobs; the list counts as empty when this is 0 */
+};
+/* Per-ctx state: one pending queue per algorithm plus a list of completed jobs. */
+struct hash_mb_queue {
+ struct hash_mb_poll_queue sm3_poll_queue;
+ struct hash_mb_poll_queue md5_poll_queue;
+ pthread_spinlock_t r_lock; /* held while the completed-job list is updated */
+ struct hash_job *recv_head;
+ struct hash_job *recv_tail;
+ __u32 complete_cnt; /* number of jobs on the completed list */
+ __u8 ctx_mode; /* copied from the ctx config; compared with CTX_MODE_SYNC/CTX_MODE_ASYNC */
+};
+/* Driver private data: the copy of the ctx configuration taken in hash_mb_init(). */
+struct hash_mb_ctx {
+ struct wd_ctx_config_internal config;
+};
+/* Initial digest state for SM3 (32 bytes); copied into result_digest at the start of a hash. */
+static __u8 sm3_iv_data[SM3_DIGEST_DATA_SIZE] = {
+ 0x73, 0x80, 0x16, 0x6f, 0x49, 0x14, 0xb2, 0xb9,
+ 0x17, 0x24, 0x42, 0xd7, 0xda, 0x8a, 0x06, 0x00,
+ 0xa9, 0x6f, 0x30, 0xbc, 0x16, 0x31, 0x38, 0xaa,
+ 0xe3, 0x8d, 0xee, 0x4d, 0xb0, 0xfb, 0x0e, 0x4e,
+};
+/* Initial digest state for MD5 (16 bytes); copied into result_digest at the start of a hash. */
+static __u8 md5_iv_data[MD5_DIGEST_DATA_SIZE] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
+ 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
+};
+/* MD5 kernel table; attached to md5_poll_queue in hash_mb_queue_init(). */
+static struct hash_mb_ops md5_ops = {
+ .max_lanes = md5_mb_sve_max_lanes,
+ .asimd_x4 = md5_mb_asimd_x4,
+ .asimd_x1 = md5_mb_asimd_x1,
+ .sve = md5_mb_sve,
+ .iv_data = md5_iv_data,
+ .iv_bytes = MD5_DIGEST_DATA_SIZE,
+ .max_jobs = HASH_MAX_LANES, /* matches the size of job_vecs[] in hash_mb_do_jobs() */
+};
+/* SM3 kernel table; attached to sm3_poll_queue in hash_mb_queue_init(). */
+static struct hash_mb_ops sm3_ops = {
+ .max_lanes = sm3_mb_sve_max_lanes,
+ .asimd_x4 = sm3_mb_asimd_x4,
+ .asimd_x1 = sm3_mb_asimd_x1,
+ .sve = sm3_mb_sve,
+ .iv_data = sm3_iv_data,
+ .iv_bytes = SM3_DIGEST_DATA_SIZE,
+ .max_jobs = SM3_MAX_LANES, /* SM3 batches are capped at 16 jobs, half of HASH_MAX_LANES */
+};
+/* Destroy the spinlock of one poll queue; queued jobs are not touched. */
+static void hash_mb_uninit_poll_queue(struct hash_mb_poll_queue *poll_queue)
+{
+ pthread_spin_destroy(&poll_queue->s_lock);
+}
+/* Destroy the locks of, and free, the hash_mb_queue attached to each of the first ctx_num ctxs. */
+static void hash_mb_queue_uninit(struct wd_ctx_config_internal *config, int ctx_num)
+{
+ struct hash_mb_queue *mb_queue;
+ struct wd_soft_ctx *ctx;
+ int i;
+
+ for (i = 0; i < ctx_num; i++) {
+ ctx = (struct wd_soft_ctx *)config->ctxs[i].ctx;
+ mb_queue = ctx->priv;
+ pthread_spin_destroy(&mb_queue->r_lock);
+ hash_mb_uninit_poll_queue(&mb_queue->sm3_poll_queue);
+ hash_mb_uninit_poll_queue(&mb_queue->md5_poll_queue);
+ free(mb_queue); /* NOTE(review): ctx->priv is not cleared and jobs still queued are not freed */
+ }
+}
+/* Initialise one poll queue as empty; returns the pthread_spin_init() error code on failure. */
+static int hash_mb_init_poll_queue(struct hash_mb_poll_queue *poll_queue)
+{
+ int ret;
+
+ ret = pthread_spin_init(&poll_queue->s_lock, PTHREAD_PROCESS_SHARED);
+ if (ret) {
+ WD_ERR("failed to init s_lock!\n");
+ return ret;
+ }
+
+ poll_queue->head = NULL;
+ poll_queue->tail = NULL;
+ poll_queue->job_num = 0;
+
+ return WD_SUCCESS; /* ops is filled in by the caller */
+}
+/* Create one hash_mb_queue per ctx; on failure undo the partial queue and all earlier ones. */
+static int hash_mb_queue_init(struct wd_ctx_config_internal *config)
+{
+ struct hash_mb_queue *mb_queue;
+ int ctx_num = config->ctx_num;
+ struct wd_soft_ctx *ctx;
+ int i, ret;
+
+ for (i = 0; i < ctx_num; i++) {
+ mb_queue = calloc(1, sizeof(struct hash_mb_queue));
+ if (!mb_queue) {
+ ret = -WD_ENOMEM;
+ goto free_mb_queue;
+ }
+
+ mb_queue->ctx_mode = config->ctxs[i].ctx_mode;
+ ctx = (struct wd_soft_ctx *)config->ctxs[i].ctx;
+ ret = hash_mb_init_poll_queue(&mb_queue->sm3_poll_queue);
+ if (ret)
+ goto free_mem;
+
+ ret = hash_mb_init_poll_queue(&mb_queue->md5_poll_queue);
+ if (ret)
+ goto uninit_sm3_poll;
+
+ ret = pthread_spin_init(&mb_queue->r_lock, PTHREAD_PROCESS_SHARED);
+ if (ret) {
+ WD_ERR("failed to init r_lock!\n");
+ goto uninit_md5_poll;
+ }
+
+ mb_queue->sm3_poll_queue.ops = &sm3_ops;
+ mb_queue->md5_poll_queue.ops = &md5_ops;
+ mb_queue->recv_head = NULL;
+ mb_queue->recv_tail = NULL;
+ mb_queue->complete_cnt = 0;
+ ctx->priv = mb_queue; /* publish only once fully built, so error paths never leave it dangling */
+ }
+
+ return WD_SUCCESS;
+
+uninit_md5_poll: /* labels unwind the queue of ctx i in reverse order of construction */
+ hash_mb_uninit_poll_queue(&mb_queue->md5_poll_queue);
+uninit_sm3_poll:
+ hash_mb_uninit_poll_queue(&mb_queue->sm3_poll_queue);
+free_mem:
+ free(mb_queue);
+free_mb_queue:
+ hash_mb_queue_uninit(config, i); /* releases the queues of ctxs 0..i-1 */
+ return ret;
+}
+/* Driver init hook: copy the ctx config into drv->priv and create one queue per ctx. */
+static int hash_mb_init(struct wd_alg_driver *drv, void *conf)
+{
+ struct wd_ctx_config_internal *config = conf;
+ struct hash_mb_ctx *priv;
+ int ret;
+
+ priv = malloc(sizeof(struct hash_mb_ctx));
+ if (!priv)
+ return -WD_ENOMEM;
+
+ /* multibuff does not use epoll. */
+ config->epoll_en = 0;
+ memcpy(&priv->config, config, sizeof(struct wd_ctx_config_internal)); /* shallow copy, read back by hash_mb_exit() */
+
+ ret = hash_mb_queue_init(config);
+ if (ret) {
+ free(priv);
+ return ret;
+ }
+
+ drv->priv = priv;
+
+ return WD_SUCCESS;
+}
+/* Driver exit hook: free every per-ctx queue and the private data; no-op if drv->priv is NULL. */
+static void hash_mb_exit(struct wd_alg_driver *drv)
+{
+ struct hash_mb_ctx *priv = (struct hash_mb_ctx *)drv->priv;
+
+ if (!priv)
+ return;
+
+ hash_mb_queue_uninit(&priv->config, priv->config.ctx_num);
+ free(priv);
+ drv->priv = NULL;
+}
+/* Build the final padding in hash_pad->pad: tail bytes, 0x80, zeros, 64-bit bit length. */
+static void hash_mb_pad_data(struct hash_pad *hash_pad, __u8 *in, __u32 partial,
+ __u64 total_len, bool transfer)
+{
+ __u64 size = total_len << BYTES_TO_BITS_OFFSET; /* message length in bits */
+ __u8 *buffer = hash_pad->pad;
+
+ if (partial)
+ memcpy(buffer, in, partial); /* unprocessed tail of the message */
+
+ buffer[partial++] = 0x80;
+ if (partial <= HASH_PADLENGTHFIELD_SIZE) { /* tail, 0x80 and length fit in one block */
+ memset(buffer + partial, 0, HASH_PADLENGTHFIELD_SIZE - partial);
+ if (transfer) { /* length stored most significant byte first */
+ PUTU32(buffer + HASH_PADLENGTHFIELD_SIZE, size >> HASH_HIGH_32BITS);
+ PUTU32(buffer + HASH_PADLENGTHFIELD_SIZE + sizeof(__u32), size);
+ } else { /* length stored in host byte order */
+ memcpy(buffer + HASH_PADLENGTHFIELD_SIZE, &size, sizeof(__u64));
+ }
+ hash_pad->pad_len = 1; /* pad_len counts 64-byte blocks */
+ } else { /* no room for the length: spill into a second block */
+ memset(buffer + partial, 0, HASH_PADDING_SIZE - partial);
+ if (transfer) {
+ PUTU32(buffer + HASH_PADDING_SIZE, size >> HASH_HIGH_32BITS);
+ PUTU32(buffer + HASH_PADDING_SIZE + sizeof(__u32), size);
+ } else {
+ memcpy(buffer + HASH_PADDING_SIZE, &size, sizeof(__u64));
+ }
+ hash_pad->pad_len = HASH_PADDING_BLOCKS;
+ }
+}
+/* Fill key_out[0..HASH_KEY_LEN) with the key, treated as zero-padded, XORed with xor_value. */
+static inline void hash_xor(__u8 *key_out, __u8 *key_in, __u32 key_len, __u8 xor_value)
+{
+ __u32 i;
+
+ for (i = 0; i < HASH_KEY_LEN; i++) {
+ if (i < key_len)
+ key_out[i] = key_in[i] ^ xor_value;
+ else
+ key_out[i] = xor_value; /* 0 ^ xor_value; key bytes past HASH_KEY_LEN are never read */
+ }
+}
+/* Middle chunk of a long hash: merge with the saved tail; -WD_EAGAIN if no whole block is left. */
+static int hash_middle_block_process(struct hash_mb_poll_queue *poll_queue,
+ struct wd_digest_msg *d_msg,
+ struct hash_job *job)
+{
+ __u8 *buffer = d_msg->partial_block + d_msg->partial_bytes; /* first free byte of the saved tail */
+ __u64 length = (__u64)d_msg->partial_bytes + d_msg->in_bytes;
+
+ if (length < HASH_BLOCK_SIZE) { /* still less than one block: just accumulate */
+ memcpy(buffer, d_msg->in, d_msg->in_bytes);
+ d_msg->partial_bytes = length;
+ return -WD_EAGAIN;
+ }
+
+ if (d_msg->partial_bytes) { /* complete the saved block and hash it right away */
+ memcpy(buffer, d_msg->in, HASH_BLOCK_SIZE - d_msg->partial_bytes);
+ job->buffer = d_msg->partial_block;
+ poll_queue->ops->asimd_x1(job, 1);
+ length = d_msg->in_bytes - (HASH_BLOCK_SIZE - d_msg->partial_bytes);
+ buffer = d_msg->in + (HASH_BLOCK_SIZE - d_msg->partial_bytes);
+ } else {
+ buffer = d_msg->in;
+ }
+
+ job->len = length >> HASH_BLOCK_OFFSET; /* whole blocks remaining in the input */
+ d_msg->partial_bytes = length & (HASH_BLOCK_SIZE - 1);
+ if (d_msg->partial_bytes) /* stash the new tail for the next chunk */
+ memcpy(d_msg->partial_block, buffer + (job->len << HASH_BLOCK_OFFSET),
+ d_msg->partial_bytes);
+
+ if (!job->len) { /* nothing more to hash: save the running state in out */
+ memcpy(d_msg->out, job->result_digest, poll_queue->ops->iv_bytes);
+ return -WD_EAGAIN;
+ }
+
+ job->buffer = buffer;
+ job->pad.pad_len = 0; /* no padding until the final chunk */
+
+ return WD_SUCCESS;
+}
+/* Prepare a job whose whole remaining input is d_msg->in: whole blocks first, then the padding. */
+static void hash_signle_block_process(struct wd_digest_msg *d_msg,
+ struct hash_job *job, __u64 total_len)
+{
+ __u32 hash_partial = d_msg->in_bytes & (HASH_BLOCK_SIZE - 1);
+ __u8 *buffer;
+
+ job->len = d_msg->in_bytes >> HASH_BLOCK_OFFSET;
+ buffer = d_msg->in + (job->len << HASH_BLOCK_OFFSET); /* start of the trailing partial block */
+ hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer);
+ if (!job->len) { /* no whole block: the padding itself is the only data to hash */
+ job->buffer = job->pad.pad;
+ job->len = job->pad.pad_len;
+ job->pad.pad_len = 0;
+ return;
+ }
+
+ job->buffer = d_msg->in; /* pad.pad_len stays set so the padding is hashed afterwards */
+}
+/* Last chunk of a long hash: merge with the saved tail, then set up data and final padding. */
+static void hash_final_block_process(struct hash_mb_poll_queue *poll_queue,
+ struct wd_digest_msg *d_msg,
+ struct hash_job *job)
+{
+ __u8 *buffer = d_msg->partial_block + d_msg->partial_bytes;
+ __u64 length = (__u64)d_msg->partial_bytes + d_msg->in_bytes;
+ __u32 hash_partial = length & (HASH_BLOCK_SIZE - 1);
+ __u64 total_len = d_msg->long_data_len;
+
+ if (job->opad.opad_size) /* set for HMAC by hash_mb_init_iv(): count the key block too */
+ total_len += HASH_BLOCK_SIZE;
+
+ if (!d_msg->partial_bytes) { /* no saved tail: same as the single-shot case */
+ hash_signle_block_process(d_msg, job, total_len);
+ return;
+ }
+
+ if (length <= HASH_BLOCK_SIZE) { /* everything fits in the saved partial block */
+ memcpy(buffer, d_msg->in, d_msg->in_bytes);
+ job->len = length >> HASH_BLOCK_OFFSET; /* 0 or 1 */
+ buffer = d_msg->partial_block + (job->len << HASH_BLOCK_OFFSET);
+ hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer);
+ if (!job->len) {
+ job->buffer = job->pad.pad;
+ job->len = job->pad.pad_len;
+ job->pad.pad_len = 0;
+ return;
+ }
+
+ job->buffer = d_msg->partial_block;
+ return;
+ }
+
+ memcpy(buffer, d_msg->in, (HASH_BLOCK_SIZE - d_msg->partial_bytes)); /* complete the saved block */
+ job->buffer = d_msg->partial_block;
+ poll_queue->ops->asimd_x1(job, 1); /* hash it immediately */
+ job->buffer = d_msg->in + (HASH_BLOCK_SIZE - d_msg->partial_bytes);
+ length = d_msg->in_bytes - (HASH_BLOCK_SIZE - d_msg->partial_bytes);
+ job->len = length >> HASH_BLOCK_OFFSET;
+ buffer = job->buffer + (job->len << HASH_BLOCK_OFFSET); /* arithmetic on void * (GNU extension) */
+ hash_partial = length & (HASH_BLOCK_SIZE - 1);
+ hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer);
+ if (!job->len) { /* only the padding is left to hash */
+ job->buffer = job->pad.pad;
+ job->len = job->pad.pad_len;
+ job->pad.pad_len = 0;
+ }
+}
+/* First chunk of a long hash: queue whole blocks, stash the tail; -WD_EAGAIN if no whole block. */
+static int hash_first_block_process(struct wd_digest_msg *d_msg,
+ struct hash_job *job,
+ __u32 iv_bytes)
+{
+ __u8 *buffer;
+
+ job->len = d_msg->in_bytes >> HASH_BLOCK_OFFSET;
+ d_msg->partial_bytes = d_msg->in_bytes & (HASH_BLOCK_SIZE - 1);
+ if (d_msg->partial_bytes) { /* keep the trailing bytes for the next chunk */
+ buffer = d_msg->in + (job->len << HASH_BLOCK_OFFSET);
+ memcpy(d_msg->partial_block, buffer, d_msg->partial_bytes);
+ }
+
+ /*
+ * Long hash mode, if first block is less than HASH_BLOCK_SIZE,
+ * copy ikey hash result to out.
+ */
+ if (!job->len) {
+ memcpy(d_msg->out, job->result_digest, iv_bytes);
+ return -WD_EAGAIN;
+ }
+ job->buffer = d_msg->in;
+ job->pad.pad_len = 0; /* no padding until the final chunk */
+
+ return WD_SUCCESS;
+}
+/* Dispatch on the block type of d_msg; -WD_EAGAIN means nothing is left to hash for this call. */
+static int hash_do_partial(struct hash_mb_poll_queue *poll_queue,
+ struct wd_digest_msg *d_msg, struct hash_job *job)
+{
+ enum hash_block_type bd_type = get_hash_block_type(d_msg);
+ __u64 total_len = d_msg->in_bytes; /* only used by the single-block case */
+ int ret = WD_SUCCESS;
+
+ switch (bd_type) {
+ case HASH_FIRST_BLOCK:
+ ret = hash_first_block_process(d_msg, job, poll_queue->ops->iv_bytes);
+ break;
+ case HASH_MIDDLE_BLOCK:
+ ret = hash_middle_block_process(poll_queue, d_msg, job);
+ break;
+ case HASH_END_BLOCK:
+ hash_final_block_process(poll_queue, d_msg, job);
+ break;
+ case HASH_SINGLE_BLOCK:
+ if (job->opad.opad_size) /* HMAC: the key block counts towards the hashed length */
+ total_len += HASH_BLOCK_SIZE;
+ hash_signle_block_process(d_msg, job, total_len);
+ break;
+ }
+
+ return ret;
+}
+/* Load the starting digest state into job->result_digest and, for HMAC, absorb the key blocks. */
+static void hash_mb_init_iv(struct hash_mb_poll_queue *poll_queue,
+ struct wd_digest_msg *d_msg, struct hash_job *job)
+{
+ enum hash_block_type bd_type = get_hash_block_type(d_msg);
+ __u8 key_ipad[HASH_KEY_LEN];
+ __u8 key_opad[HASH_KEY_LEN];
+
+ job->opad.opad_size = 0; /* non-zero later marks a job that still needs the outer HMAC hash */
+ switch (bd_type) {
+ case HASH_FIRST_BLOCK: /* start from the IV; HMAC additionally absorbs key ^ ipad */
+ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes);
+ if (d_msg->mode != WD_DIGEST_HMAC)
+ return;
+
+ hash_xor(key_ipad, d_msg->key, d_msg->key_bytes, IPAD_VALUE);
+ job->buffer = key_ipad;
+ poll_queue->ops->asimd_x1(job, 1);
+ break;
+ case HASH_MIDDLE_BLOCK: /* resume from the state the previous call left in out */
+ memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes);
+ break;
+ case HASH_END_BLOCK:
+ if (d_msg->mode != WD_DIGEST_HMAC) {
+ memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes);
+ return;
+ }
+ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes);
+ hash_xor(key_opad, d_msg->key, d_msg->key_bytes, OPAD_VALUE);
+ job->buffer = key_opad;
+ poll_queue->ops->asimd_x1(job, 1); /* state after hashing key ^ opad from the IV */
+ memcpy(job->opad.opad, job->result_digest, poll_queue->ops->iv_bytes); /* saved for the outer hash */
+ job->opad.opad_size = poll_queue->ops->iv_bytes;
+ memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes); /* back to the inner state */
+ break;
+ case HASH_SINGLE_BLOCK:
+ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes);
+ if (d_msg->mode != WD_DIGEST_HMAC)
+ return;
+
+ hash_xor(key_ipad, d_msg->key, d_msg->key_bytes, IPAD_VALUE);
+ hash_xor(key_opad, d_msg->key, d_msg->key_bytes, OPAD_VALUE);
+ job->buffer = key_opad;
+ poll_queue->ops->asimd_x1(job, 1); /* outer key state first, saved in opad */
+ memcpy(job->opad.opad, job->result_digest, poll_queue->ops->iv_bytes);
+ job->opad.opad_size = poll_queue->ops->iv_bytes;
+ job->buffer = key_ipad;
+ memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes);
+ poll_queue->ops->asimd_x1(job, 1); /* then restart from the IV with key ^ ipad */
+ break;
+ }
+}
+/* Run a prepared job to completion on the calling thread: data, padding, then HMAC outer hash. */
+static void hash_do_sync(struct hash_mb_poll_queue *poll_queue, struct hash_job *job)
+{
+ __u32 iv_bytes = poll_queue->ops->iv_bytes;
+ __u32 length;
+
+ poll_queue->ops->asimd_x1(job, job->len); /* NOTE(review): __u64 len narrows to the kernel's int parameter */
+
+ if (job->pad.pad_len) { /* padding not yet folded into buffer/len */
+ job->buffer = job->pad.pad;
+ poll_queue->ops->asimd_x1(job, job->pad.pad_len);
+ }
+
+ if (job->opad.opad_size) { /* HMAC: hash the inner digest starting from the saved opad state */
+ job->buffer = job->opad.opad + job->opad.opad_size; /* scratch space right after the saved state */
+ memcpy(job->buffer, job->result_digest, iv_bytes);
+ memcpy(job->result_digest, job->opad.opad, iv_bytes);
+ length = HASH_BLOCK_SIZE + iv_bytes; /* key block plus inner digest */
+ hash_mb_pad_data(&job->pad, job->buffer, iv_bytes, length, job->is_transfer);
+ job->buffer = job->pad.pad;
+ poll_queue->ops->asimd_x1(job, job->pad.pad_len);
+ }
+}
+/* Append job to the pending list under s_lock. */
+static void hash_mb_add_job_tail(struct hash_mb_poll_queue *poll_queue, struct hash_job *job)
+{
+ pthread_spin_lock(&poll_queue->s_lock);
+ if (poll_queue->job_num) {
+ poll_queue->tail->next = job;
+ poll_queue->tail = job;
+ } else {
+ poll_queue->head = job;
+ poll_queue->tail = job;
+ }
+ poll_queue->job_num++; /* job->next is not reset; the end of the list is tracked by job_num only */
+ pthread_spin_unlock(&poll_queue->s_lock);
+}
+/* Push job onto the front of the pending list under s_lock. */
+static void hash_mb_add_job_head(struct hash_mb_poll_queue *poll_queue, struct hash_job *job)
+{
+ pthread_spin_lock(&poll_queue->s_lock);
+ if (poll_queue->job_num) {
+ job->next = poll_queue->head;
+ poll_queue->head = job;
+ } else {
+ poll_queue->head = job;
+ poll_queue->tail = job;
+ }
+ poll_queue->job_num++;
+ pthread_spin_unlock(&poll_queue->s_lock);
+}
+/* Reject requests the driver cannot serve: long hash on an async ctx, or non-flat buffers. */
+static int hash_mb_check_param(struct hash_mb_queue *mb_queue, struct wd_digest_msg *d_msg)
+{
+ if (unlikely(mb_queue->ctx_mode == CTX_MODE_ASYNC && d_msg->has_next)) {
+ WD_ERR("invalid: async mode not supports long hash!\n");
+ return -WD_EINVAL;
+ }
+
+ if (unlikely(d_msg->data_fmt != WD_FLAT_BUF)) {
+ WD_ERR("invalid: hash multibuffer not supports sgl mode!\n");
+ return -WD_EINVAL;
+ }
+
+ return WD_SUCCESS;
+}
+/* Driver send hook: a sync ctx hashes inline, an async ctx queues a heap job for hash_mb_recv(). */
+static int hash_mb_send(struct wd_alg_driver *drv, handle_t ctx, void *drv_msg)
+{
+ struct wd_soft_ctx *s_ctx = (struct wd_soft_ctx *)ctx;
+ struct hash_mb_queue *mb_queue = s_ctx->priv;
+ struct wd_digest_msg *d_msg = drv_msg;
+ struct hash_mb_poll_queue *poll_queue;
+ struct hash_job hash_sync_job; /* stack job, used in sync mode only */
+ struct hash_job *hash_job;
+ int ret;
+
+ ret = hash_mb_check_param(mb_queue, d_msg);
+ if (ret)
+ return ret;
+
+ if (mb_queue->ctx_mode == CTX_MODE_ASYNC) {
+ hash_job = malloc(sizeof(struct hash_job)); /* freed here on early exit, else in hash_recv_complete_job() */
+ if (unlikely(!hash_job))
+ return -WD_ENOMEM;
+ } else {
+ hash_job = &hash_sync_job;
+ }
+
+ switch (d_msg->alg) {
+ case WD_DIGEST_SM3:
+ poll_queue = &mb_queue->sm3_poll_queue;
+ hash_job->is_transfer = true; /* SM3 length field is written big-endian */
+ break;
+ case WD_DIGEST_MD5:
+ poll_queue = &mb_queue->md5_poll_queue;
+ hash_job->is_transfer = false; /* MD5 length field is copied in host byte order */
+ break;
+ default:
+ WD_ERR("invalid: alg type %u not support!\n", d_msg->alg);
+ if (mb_queue->ctx_mode == CTX_MODE_ASYNC)
+ free(hash_job);
+ return -WD_EINVAL;
+ }
+
+ hash_mb_init_iv(poll_queue, d_msg, hash_job);
+ /* If block not need process, return directly. */
+ ret = hash_do_partial(poll_queue, d_msg, hash_job);
+ if (ret == -WD_EAGAIN) {
+ if (mb_queue->ctx_mode == CTX_MODE_ASYNC)
+ free(hash_job);
+
+ d_msg->result = WD_SUCCESS;
+ return WD_SUCCESS;
+ }
+
+ if (mb_queue->ctx_mode == CTX_MODE_SYNC) {
+ hash_do_sync(poll_queue, hash_job);
+ memcpy(d_msg->out, hash_job->result_digest, d_msg->out_bytes); /* assumes out_bytes <= sizeof(result_digest) - TODO confirm */
+ d_msg->result = WD_SUCCESS;
+ return WD_SUCCESS;
+ }
+
+ hash_job->msg = d_msg; /* kept until hash_recv_complete_job() copies the digest out */
+ hash_mb_add_job_tail(poll_queue, hash_job);
+
+ return WD_SUCCESS;
+}
+/* Pop the oldest job from the completed list under r_lock; returns NULL when the list is empty. */
+static struct hash_job *hash_mb_find_complete_job(struct hash_mb_queue *mb_queue)
+{
+ struct hash_job *job;
+
+ pthread_spin_lock(&mb_queue->r_lock);
+ if (!mb_queue->complete_cnt) {
+ pthread_spin_unlock(&mb_queue->r_lock);
+ return NULL;
+ }
+
+ job = mb_queue->recv_head;
+ mb_queue->recv_head = job->next;
+ mb_queue->complete_cnt--;
+ pthread_spin_unlock(&mb_queue->r_lock);
+
+ return job;
+}
+/* Deliver one finished job into msg, or requeue an HMAC job for its outer hash (-WD_EAGAIN). */
+static int hash_recv_complete_job(struct hash_mb_queue *mb_queue, struct wd_digest_msg *msg)
+{
+ struct hash_mb_poll_queue *poll_queue;
+ struct hash_job *hash_job;
+ __u32 total_len;
+
+ hash_job = hash_mb_find_complete_job(mb_queue);
+ if (!hash_job)
+ return -WD_EAGAIN;
+
+ if (!hash_job->opad.opad_size) { /* plain digest, or HMAC whose outer hash is already done */
+ msg->tag = hash_job->msg->tag;
+ memcpy(hash_job->msg->out, hash_job->result_digest, hash_job->msg->out_bytes);
+ free(hash_job); /* allocated in hash_mb_send() */
+ msg->result = WD_SUCCESS;
+ return WD_SUCCESS;
+ }
+
+ if (hash_job->msg->alg == WD_DIGEST_SM3)
+ poll_queue = &mb_queue->sm3_poll_queue;
+ else
+ poll_queue = &mb_queue->md5_poll_queue;
+ hash_job->buffer = hash_job->opad.opad + poll_queue->ops->iv_bytes; /* scratch space after the saved opad state */
+ memcpy(hash_job->buffer, hash_job->result_digest, poll_queue->ops->iv_bytes); /* inner digest */
+ total_len = poll_queue->ops->iv_bytes + HASH_BLOCK_SIZE; /* key block plus inner digest */
+ hash_mb_pad_data(&hash_job->pad, hash_job->buffer, poll_queue->ops->iv_bytes,
+ total_len, hash_job->is_transfer);
+ memcpy(hash_job->result_digest, hash_job->opad.opad, poll_queue->ops->iv_bytes); /* restart from the opad state */
+ hash_job->opad.opad_size = 0; /* next completion takes the delivery path above */
+ hash_job->buffer = hash_job->pad.pad;
+ hash_job->len = hash_job->pad.pad_len;
+ hash_job->pad.pad_len = 0;
+
+ hash_mb_add_job_head(poll_queue, hash_job);
+
+ return -WD_EAGAIN;
+}
+/* Pop the first pending job under s_lock; returns NULL when job_num is 0. */
+static struct hash_job *hash_mb_get_job(struct hash_mb_poll_queue *poll_queue)
+{
+ struct hash_job *job;
+
+ pthread_spin_lock(&poll_queue->s_lock);
+ if (!poll_queue->job_num) {
+ pthread_spin_unlock(&poll_queue->s_lock);
+ return NULL;
+ }
+
+ job = poll_queue->head;
+ poll_queue->head = job->next;
+ poll_queue->job_num--;
+ pthread_spin_unlock(&poll_queue->s_lock);
+
+ return job;
+}
+/* Append job to the completed list under r_lock. */
+static void hash_mb_add_finish_job(struct hash_mb_queue *mb_queue, struct hash_job *job)
+{
+ pthread_spin_lock(&mb_queue->r_lock);
+ if (mb_queue->complete_cnt) {
+ mb_queue->recv_tail->next = job;
+ mb_queue->recv_tail = job;
+ } else {
+ mb_queue->recv_head = job;
+ mb_queue->recv_tail = job;
+ }
+ mb_queue->complete_cnt++;
+ pthread_spin_unlock(&mb_queue->r_lock);
+}
+/* Pick the poll queue with more pending jobs (MD5 wins ties); NULL when both are empty. */
+static struct hash_mb_poll_queue *hash_get_poll_queue(struct hash_mb_queue *mb_queue)
+{
+ if (!mb_queue->sm3_poll_queue.job_num && /* job_num is read here without taking s_lock */
+ !mb_queue->md5_poll_queue.job_num)
+ return NULL;
+
+ if (mb_queue->md5_poll_queue.job_num >= mb_queue->sm3_poll_queue.job_num)
+ return &mb_queue->md5_poll_queue;
+
+ return &mb_queue->sm3_poll_queue;
+}
+/* Take a batch of pending jobs, hash their common block count, then complete or requeue each. */
+static int hash_mb_do_jobs(struct hash_mb_queue *mb_queue)
+{
+ struct hash_mb_poll_queue *poll_queue = hash_get_poll_queue(mb_queue);
+ struct hash_job *job_vecs[HASH_MAX_LANES];
+ __u64 len = 0; /* smallest remaining block count in the batch */
+ int maxjobs;
+ int j = 0; /* number of jobs taken */
+ int i = 0;
+
+ if (!poll_queue)
+ return -WD_EAGAIN;
+
+ maxjobs = poll_queue->ops->max_lanes();
+ maxjobs = MIN(maxjobs, poll_queue->ops->max_jobs); /* max_jobs never exceeds the job_vecs[] size */
+ while (j < maxjobs) {
+ job_vecs[j] = hash_mb_get_job(poll_queue);
+ if (!job_vecs[j])
+ break;
+
+ if (!j)
+ len = job_vecs[j]->len;
+ else
+ len = MIN(job_vecs[j]->len, len);
+ j++;
+ }
+
+ if (!j)
+ return -WD_EAGAIN;
+
+ if (j > HASH_NENO_PROCESS_JOBS) { /* more than 4 jobs: SVE kernel */
+ poll_queue->ops->sve(len, j, job_vecs);
+ } else if (j == HASH_NENO_PROCESS_JOBS) { /* exactly 4 jobs: 4-way ASIMD kernel */
+ poll_queue->ops->asimd_x4(job_vecs[0], job_vecs[1],
+ job_vecs[2], job_vecs[3], len);
+ } else { /* 1 to 3 jobs: one at a time */
+ while (i < j)
+ poll_queue->ops->asimd_x1(job_vecs[i++], len);
+ }
+
+ for (i = 0; i < j; i++) {
+ if (job_vecs[i]->len == len) { /* all queued data of this job has been hashed */
+ if (!job_vecs[i]->pad.pad_len) {
+ hash_mb_add_finish_job(mb_queue, job_vecs[i]);
+ } else { /* padding still pending: requeue it as the job's data */
+ job_vecs[i]->buffer = job_vecs[i]->pad.pad;
+ job_vecs[i]->len = job_vecs[i]->pad.pad_len;
+ job_vecs[i]->pad.pad_len = 0;
+ hash_mb_add_job_head(poll_queue, job_vecs[i]);
+ }
+ } else { /* longer job: advance past the hashed blocks and requeue */
+ job_vecs[i]->len -= len;
+ job_vecs[i]->buffer += len << HASH_BLOCK_OFFSET; /* arithmetic on void * (GNU extension) */
+ hash_mb_add_job_head(poll_queue, job_vecs[i]);
+ }
+ }
+
+ return WD_SUCCESS;
+}
+/* Driver recv hook: on an async ctx, alternate between delivering a finished job and hashing. */
+static int hash_mb_recv(struct wd_alg_driver *drv, handle_t ctx, void *drv_msg)
+{
+ struct wd_soft_ctx *s_ctx = (struct wd_soft_ctx *)ctx;
+ struct hash_mb_queue *mb_queue = s_ctx->priv;
+ struct wd_digest_msg *msg = drv_msg;
+ int ret, i = 0;
+
+ if (mb_queue->ctx_mode == CTX_MODE_SYNC) /* sync requests are completed inside hash_mb_send() */
+ return WD_SUCCESS;
+
+ while (i++ < HASH_TRY_PROCESS_COUNT) {
+ ret = hash_recv_complete_job(mb_queue, msg);
+ if (!ret)
+ return WD_SUCCESS;
+
+ ret = hash_mb_do_jobs(mb_queue);
+ if (ret) /* -WD_EAGAIN: no pending job to work on */
+ return ret;
+ }
+
+ return -WD_EAGAIN;
+}
+/* Driver get_usage hook: always returns 0; param is unused. */
+static int hash_mb_get_usage(void *param)
+{
+ return 0;
+}
+/* Initialiser for one wd_alg_driver entry of this driver; only alg_name varies per entry. */
+#define GEN_HASH_ALG_DRIVER(hash_alg_name) \
+{\
+ .drv_name = "hash_mb",\
+ .alg_name = (hash_alg_name),\
+ .calc_type = UADK_ALG_SVE_INSTR,\
+ .priority = 100,\
+ .queue_num = 1,\
+ .op_type_num = 1,\
+ .fallback = 0,\
+ .init = hash_mb_init,\
+ .exit = hash_mb_exit,\
+ .send = hash_mb_send,\
+ .recv = hash_mb_recv,\
+ .get_usage = hash_mb_get_usage,\
+}
+/* Driver entries registered by hash_mb_probe() and unregistered by hash_mb_remove(). */
+static struct wd_alg_driver hash_mb_driver[] = {
+ GEN_HASH_ALG_DRIVER("sm3"),
+ GEN_HASH_ALG_DRIVER("md5"),
+};
+/* Library constructor: register every entry of hash_mb_driver[]. */
+static void __attribute__((constructor)) hash_mb_probe(void)
+{
+ size_t alg_num = ARRAY_SIZE(hash_mb_driver);
+ size_t i;
+ int ret;
+
+ WD_INFO("Info: register hash_mb alg drivers!\n");
+ for (i = 0; i < alg_num; i++) {
+ ret = wd_alg_driver_register(&hash_mb_driver[i]);
+ if (ret && ret != -WD_ENODEV) /* -WD_ENODEV is not reported as an error */
+ WD_ERR("Error: register hash multibuff %s failed!\n",
+ hash_mb_driver[i].alg_name);
+ }
+}
+/* Library destructor: unregister every entry of hash_mb_driver[]. */
+static void __attribute__((destructor)) hash_mb_remove(void)
+{
+ size_t alg_num = ARRAY_SIZE(hash_mb_driver);
+ size_t i;
+
+ WD_INFO("Info: unregister hash_mb alg drivers!\n");
+ for (i = 0; i < alg_num; i++)
+ wd_alg_driver_unregister(&hash_mb_driver[i]);
+}
+
diff --git a/drv/hash_mb/hash_mb.h b/drv/hash_mb/hash_mb.h
new file mode 100644
index 0000000..aba5ec9
--- /dev/null
+++ b/drv/hash_mb/hash_mb.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+/* Copyright 2024 Huawei Technologies Co.,Ltd. All rights reserved. */
+
+#ifndef __HASH_MB_H
+#define __HASH_MB_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "drv/wd_digest_drv.h"
+#include "wd_digest.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HASH_BLOCK_SIZE 64
+#define HASH_DIGEST_NWORDS 32
+/* 64-byte alignment attribute for hash_job.result_digest; both branches use the GNU spelling. */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+# define __ALIGN_END __attribute__((aligned(64)))
+#else
+# define __ALIGN_END __attribute__((aligned(64))) /* was __aligned(64), which this patch never defines */
+#endif
+/* Scratch buffer for the final padding built by hash_mb_pad_data(). */
+struct hash_pad {
+ __u8 pad[HASH_BLOCK_SIZE * 2]; /* room for the one- or two-block padding */
+ __u32 pad_len; /* padding length in 64-byte blocks; 0 when no padding is pending */
+};
+/* HMAC outer-hash state: digest after the key ^ opad block, followed by scratch space. */
+struct hash_opad {
+ __u8 opad[HASH_BLOCK_SIZE]; /* first opad_size bytes: saved state; the inner digest is copied after it */
+ __u32 opad_size; /* 0 for plain digests and once the outer hash has been set up */
+};
+/* One hash request; md5_mb_asimd_x1 reads buffer at offset 0 and the digest at offset 64. */
+struct hash_job {
+ void *buffer; /* next input block to hash */
+ __u64 len; /* remaining input, counted in 64-byte blocks */
+ __u8 result_digest[HASH_DIGEST_NWORDS] __ALIGN_END; /* running digest; 32 bytes despite the NWORDS name */
+ struct hash_pad pad;
+ struct hash_opad opad;
+ struct hash_job *next; /* link used by the pending and completed lists */
+ struct wd_digest_msg *msg; /* originating request; set for async jobs in hash_mb_send() */
+ bool is_transfer; /* true: padding length written big-endian (set for SM3, cleared for MD5) */
+};
+/* Assembly kernels from the .S files in this directory; len and blocks count 64-byte blocks. */
+void sm3_mb_sve(int blocks, int total_lanes, struct hash_job **job_vec);
+void sm3_mb_asimd_x4(struct hash_job *job1, struct hash_job *job2,
+ struct hash_job *job3, struct hash_job *job4, int len);
+void sm3_mb_asimd_x1(struct hash_job *job, int len);
+int sm3_mb_sve_max_lanes(void);
+void md5_mb_sve(int blocks, int total_lanes, struct hash_job **job_vec);
+void md5_mb_asimd_x4(struct hash_job *job1, struct hash_job *job2,
+ struct hash_job *job3, struct hash_job *job4, int len);
+void md5_mb_asimd_x1(struct hash_job *job, int len);
+int md5_mb_sve_max_lanes(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __HASH_MB_H */
+
diff --git a/drv/hash_mb/md5_mb_asimd_x1.S b/drv/hash_mb/md5_mb_asimd_x1.S
new file mode 100644
index 0000000..27d1124
--- /dev/null
+++ b/drv/hash_mb/md5_mb_asimd_x1.S
@@ -0,0 +1,248 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a
+
+/*
+Macros
+*/
+/* Define the aliases q_NAME, v_NAME and s_NAME for vector register number REG. */
+.macro declare_var_vector_reg name:req,reg:req
+ q_\name .req q\reg
+ v_\name .req v\reg
+ s_\name .req s\reg
+.endm
+
+/* MD5 step, rounds 0-15: d_a = d_b + rol(d_a + F + K + w, r), F = ((d_c ^ d_d) & d_b) ^ d_d. */
+.macro round_0_15 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ eor tmp0,\d_c,\d_d
+ mov k,\kl /* low 16 bits of the round constant */
+ and tmp0,tmp0,\d_b
+ movk k,\kh,lsl 16 /* high 16 bits of the round constant */
+ eor tmp0,tmp0,\d_d
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r /* rotate right by 32 - r, i.e. rotate left by r */
+ add \d_a,\d_b,tmp0
+.endm
+/* MD5 step, rounds 16-31: d_a = d_b + rol(d_a + G + K + w, r), G = ((d_b ^ d_c) & d_d) ^ d_c. */
+.macro round_16_31 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ eor tmp0,\d_b,\d_c
+ mov k,\kl /* low 16 bits of the round constant */
+ and tmp0,tmp0,\d_d
+ movk k,\kh,lsl 16 /* high 16 bits of the round constant */
+ eor tmp0,tmp0,\d_c
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r /* rotate right by 32 - r, i.e. rotate left by r */
+ add \d_a,\d_b,tmp0
+.endm
+/* MD5 step, rounds 32-47: d_a = d_b + rol(d_a + H + K + w, r), H = d_b ^ d_c ^ d_d. */
+.macro round_32_47 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ eor tmp0,\d_b,\d_c
+ mov k,\kl /* low 16 bits of the round constant */
+ eor tmp0,tmp0,\d_d
+ movk k,\kh,lsl 16 /* high 16 bits of the round constant */
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r /* rotate right by 32 - r, i.e. rotate left by r */
+ add \d_a,\d_b,tmp0
+.endm
+/* MD5 step, rounds 48-63: d_a = d_b + rol(d_a + I + K + w, r), I = (d_b | ~d_d) ^ d_c. */
+.macro round_48_63 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ orn tmp0,\d_b,\d_d /* d_b | ~d_d */
+ mov k,\kl /* low 16 bits of the round constant */
+ eor tmp0,tmp0,\d_c
+ movk k,\kh,lsl 16 /* high 16 bits of the round constant */
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r /* rotate right by 32 - r, i.e. rotate left by r */
+ add \d_a,\d_b,tmp0
+.endm
+/*
+ variables
+*/
+ job0 .req x0
+ digest_addr .req x0
+ len .req w1
+ end .req x1
+
+ buf_adr .req x2
+ d_a .req w3
+ d_b .req w4
+ d_c .req w5
+ d_d .req w6
+ k .req w7
+ m0 .req w8
+ m1 .req w9
+ m2 .req w10
+ m3 .req w11
+ m4 .req w12
+ m5 .req w13
+ m6 .req w14
+ m7 .req w15
+ m8 .req w19
+ m9 .req w20
+ m10 .req w21
+ m11 .req w22
+ m12 .req w23
+ m13 .req w24
+ m14 .req w25
+ m15 .req w26
+
+ tmp0 .req w27
+ tmp1 .req w28
+
+ d_a1 .req w8
+ d_b1 .req w9
+ d_c1 .req w15
+ d_d1 .req w19
+
+/*
+ void md5_mb_asimd_x1(MD5_JOB * job0,int len): scalar MD5 update over len 64-byte blocks of one job
+*/
+ .global md5_mb_asimd_x1
+ .type md5_mb_asimd_x1, %function
+md5_mb_asimd_x1:
+ cmp len,0  // flags are consumed by the ble below, nothing in between modifies them
+ stp x29, x30, [sp,-96]!  // 96-byte frame holding x29/x30 and x19-x28
+ ldr buf_adr,[job0],64  // load 8 bytes from the start of the job, then advance x0 by 64
+ stp x19, x20, [sp, 16]
+ add end,buf_adr,end,lsl 6  // end = buf_adr + (x1 << 6). NOTE(review): reads all 64 bits of x1 although len is declared int - confirm callers zero-extend
+ stp x21, x22, [sp, 32]
+ ldp d_a,d_b,[digest_addr]  // digest words are read from job + 64
+ stp x23, x24, [sp, 48]
+ ldp d_c,d_d,[digest_addr,8]
+ stp x25, x26, [sp, 64]
+ stp x27, x28, [sp, 80]
+ ble .exit  // len <= 0: skip the loop and only restore registers
+
+.loop_start:
+ ldp m0,m1,[buf_adr],8  // message loads are interleaved with the first rounds
+ ldp m2,m3,[buf_adr],8
+ round_0_15 d_a,d_b,d_c,d_d,0xd76a,0xa478,m0,7
+
+ ldp m4,m5,[buf_adr],8
+ round_0_15 d_d,d_a,d_b,d_c,0xe8c7,0xb756,m1,12
+ ldp m6,m7,[buf_adr],8
+ round_0_15 d_c,d_d,d_a,d_b,0x2420,0x70db,m2,17
+ ldp m8,m9,[buf_adr],8
+ round_0_15 d_b,d_c,d_d,d_a,0xc1bd,0xceee,m3,22
+ ldp m10,m11,[buf_adr],8
+ round_0_15 d_a,d_b,d_c,d_d,0xf57c,0xfaf,m4,7
+ ldp m12,m13,[buf_adr],8
+ round_0_15 d_d,d_a,d_b,d_c,0x4787,0xc62a,m5,12
+ ldp m14,m15,[buf_adr],8  // buf_adr has now advanced by 64 bytes
+ round_0_15 d_c,d_d,d_a,d_b,0xa830,0x4613,m6,17
+ round_0_15 d_b,d_c,d_d,d_a,0xfd46,0x9501,m7,22
+ round_0_15 d_a,d_b,d_c,d_d,0x6980,0x98d8,m8,7
+ round_0_15 d_d,d_a,d_b,d_c,0x8b44,0xf7af,m9,12
+ round_0_15 d_c,d_d,d_a,d_b,0xffff,0x5bb1,m10,17
+ round_0_15 d_b,d_c,d_d,d_a,0x895c,0xd7be,m11,22
+ round_0_15 d_a,d_b,d_c,d_d,0x6b90,0x1122,m12,7
+ round_0_15 d_d,d_a,d_b,d_c,0xfd98,0x7193,m13,12
+ round_0_15 d_c,d_d,d_a,d_b,0xa679,0x438e,m14,17
+ round_0_15 d_b,d_c,d_d,d_a,0x49b4,0x821,m15,22
+
+ round_16_31 d_a,d_b,d_c,d_d,0xf61e,0x2562,m1,5
+ round_16_31 d_d,d_a,d_b,d_c,0xc040,0xb340,m6,9
+ round_16_31 d_c,d_d,d_a,d_b,0x265e,0x5a51,m11,14
+ round_16_31 d_b,d_c,d_d,d_a,0xe9b6,0xc7aa,m0,20
+ round_16_31 d_a,d_b,d_c,d_d,0xd62f,0x105d,m5,5
+ round_16_31 d_d,d_a,d_b,d_c,0x244,0x1453,m10,9
+ round_16_31 d_c,d_d,d_a,d_b,0xd8a1,0xe681,m15,14
+ round_16_31 d_b,d_c,d_d,d_a,0xe7d3,0xfbc8,m4,20
+ round_16_31 d_a,d_b,d_c,d_d,0x21e1,0xcde6,m9,5
+ round_16_31 d_d,d_a,d_b,d_c,0xc337,0x7d6,m14,9
+ round_16_31 d_c,d_d,d_a,d_b,0xf4d5,0xd87,m3,14
+ round_16_31 d_b,d_c,d_d,d_a,0x455a,0x14ed,m8,20
+ round_16_31 d_a,d_b,d_c,d_d,0xa9e3,0xe905,m13,5
+ round_16_31 d_d,d_a,d_b,d_c,0xfcef,0xa3f8,m2,9
+ round_16_31 d_c,d_d,d_a,d_b,0x676f,0x2d9,m7,14
+ round_16_31 d_b,d_c,d_d,d_a,0x8d2a,0x4c8a,m12,20
+
+ round_32_47 d_a,d_b,d_c,d_d,0xfffa,0x3942,m5,4
+ round_32_47 d_d,d_a,d_b,d_c,0x8771,0xf681,m8,11
+ round_32_47 d_c,d_d,d_a,d_b,0x6d9d,0x6122,m11,16
+ round_32_47 d_b,d_c,d_d,d_a,0xfde5,0x380c,m14,23
+ round_32_47 d_a,d_b,d_c,d_d,0xa4be,0xea44,m1,4
+ round_32_47 d_d,d_a,d_b,d_c,0x4bde,0xcfa9,m4,11
+ round_32_47 d_c,d_d,d_a,d_b,0xf6bb,0x4b60,m7,16
+ round_32_47 d_b,d_c,d_d,d_a,0xbebf,0xbc70,m10,23
+ round_32_47 d_a,d_b,d_c,d_d,0x289b,0x7ec6,m13,4
+ round_32_47 d_d,d_a,d_b,d_c,0xeaa1,0x27fa,m0,11
+ round_32_47 d_c,d_d,d_a,d_b,0xd4ef,0x3085,m3,16
+ round_32_47 d_b,d_c,d_d,d_a,0x488,0x1d05,m6,23
+ round_32_47 d_a,d_b,d_c,d_d,0xd9d4,0xd039,m9,4
+ round_32_47 d_d,d_a,d_b,d_c,0xe6db,0x99e5,m12,11
+ round_32_47 d_c,d_d,d_a,d_b,0x1fa2,0x7cf8,m15,16
+ round_32_47 d_b,d_c,d_d,d_a,0xc4ac,0x5665,m2,23
+
+ round_48_63 d_a,d_b,d_c,d_d,0xf429,0x2244,m0,6
+ round_48_63 d_d,d_a,d_b,d_c,0x432a,0xff97,m7,10
+ round_48_63 d_c,d_d,d_a,d_b,0xab94,0x23a7,m14,15
+ round_48_63 d_b,d_c,d_d,d_a,0xfc93,0xa039,m5,21
+ round_48_63 d_a,d_b,d_c,d_d,0x655b,0x59c3,m12,6
+ round_48_63 d_d,d_a,d_b,d_c,0x8f0c,0xcc92,m3,10
+ round_48_63 d_c,d_d,d_a,d_b,0xffef,0xf47d,m10,15
+ round_48_63 d_b,d_c,d_d,d_a,0x8584,0x5dd1,m1,21
+ round_48_63 d_a,d_b,d_c,d_d,0x6fa8,0x7e4f,m8,6
+ round_48_63 d_d,d_a,d_b,d_c,0xfe2c,0xe6e0,m15,10
+ round_48_63 d_c,d_d,d_a,d_b,0xa301,0x4314,m6,15
+ round_48_63 d_b,d_c,d_d,d_a,0x4e08,0x11a1,m13,21
+ round_48_63 d_a,d_b,d_c,d_d,0xf753,0x7e82,m4,6
+ ldp d_a1,d_b1,[digest_addr]  // reload the stored digest into w8/w9, m0 and m1 are not used by the remaining rounds
+ round_48_63 d_d,d_a,d_b,d_c,0xbd3a,0xf235,m11,10
+ ldp d_c1,d_d1,[digest_addr,8]  // w15/w19 are free as well, m7 and m8 are not used by the remaining rounds
+ round_48_63 d_c,d_d,d_a,d_b,0x2ad7,0xd2bb,m2,15
+ round_48_63 d_b,d_c,d_d,d_a,0xeb86,0xd391,m9,21
+
+ cmp buf_adr,end  // flags for the bne below, the add/str instructions in between do not modify them
+ add d_a,d_a1 ,d_a  // add the previous digest to the working state and store it back
+ str d_a,[digest_addr]
+ add d_b,d_b1 ,d_b
+ str d_b,[digest_addr,4]
+ add d_c,d_c1 ,d_c
+ str d_c,[digest_addr,8]
+ add d_d,d_d1 ,d_d
+ str d_d,[digest_addr,12]
+ bne .loop_start  // continue until buf_adr reaches end
+
+.exit:
+ ldp x19, x20, [sp, 16]  // restore the registers saved in the prologue
+ ldp x21, x22, [sp, 32]
+ ldp x23, x24, [sp, 48]
+ ldp x25, x26, [sp, 64]
+ ldp x27, x28, [sp, 80]
+ ldp x29, x30, [sp], 96
+ ret
+ .size md5_mb_asimd_x1, .-md5_mb_asimd_x1
diff --git a/drv/hash_mb/md5_mb_asimd_x4.S b/drv/hash_mb/md5_mb_asimd_x4.S
new file mode 100644
index 0000000..5397913
--- /dev/null
+++ b/drv/hash_mb/md5_mb_asimd_x4.S
@@ -0,0 +1,526 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req  // alias SIMD register number reg as q_NAME, v_NAME and s_NAME
+ q_\name .req q\reg
+ v_\name .req v\reg
+ s_\name .req s\reg
+.endm
+
+.macro add_key_rol a:req,b:req,k:req,w:req,r:req  // a = b + rol(a + v_tmp1 + k + w, r) per 32-bit lane, v_tmp1 must hold the round function on entry
+ add v_tmp0.4s,v_\k\().4s,v_\w\().4s
+ add v_tmp1.4s,v_tmp1.4s,v_\a\().4s
+ add v_tmp1.4s,v_tmp1.4s,v_tmp0.4s  // v_tmp1 = a + f + k + w
+ shl v_tmp0.4s,v_tmp1.4s,\r  // rotate left by r, built from shl + ushr + orr
+ ushr v_tmp1.4s,v_tmp1.4s,32-\r
+ orr v_tmp0.16b,v_tmp1.16b,v_tmp0.16b
+
+ add v_\a\().4s,v_\b\().4s,v_tmp0.4s
+.endm
+.macro round_0_15 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req  // MD5 step with F = (b & c) | (~b & d) on four lanes
+ mov v_tmp1.16b, v_\b\().16b  // b is the bit-select mask
+ bsl v_tmp1.16b, v_\c\().16b, v_\d\().16b  // take c where b is set, d elsewhere
+ ldr q_\k1,[key_adr],16  // prefetch the constant vector for the next step and advance key_adr
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+
+.macro round_16_31 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req  // MD5 step with G = (d & b) | (~d & c) on four lanes
+ mov v_tmp1.16b, v_\d\().16b  // d is the bit-select mask
+ bsl v_tmp1.16b, v_\b\().16b, v_\c\().16b  // take b where d is set, c elsewhere
+ ldr q_\k1,[key_adr],16  // prefetch the constant vector for the next step and advance key_adr
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+
+.macro round_32_47 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req  // MD5 step with H = b ^ c ^ d on four lanes
+ eor v_tmp1.16b,v_\b\().16b,v_\c\().16b
+ eor v_tmp1.16b,v_tmp1.16b,v_\d\().16b
+ ldr q_\k1,[key_adr],16  // prefetch the constant vector for the next step and advance key_adr
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+
+.macro round_48_63 a:req,b:req,c:req,d:req,k:req,k1,w:req,r:req  // MD5 step with I = c ^ (b | ~d) on four lanes, k1 is optional
+ orn v_tmp1.16b,v_\b\().16b,v_\d\().16b  // b | ~d
+ eor v_tmp1.16b,v_tmp1.16b,v_\c\().16b
+ .ifnb \k1  // the final step passes an empty k1 so that nothing is read past the constant table
+ ldr q_\k1,[key_adr],16
+ .endif
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+/*
+ variables: register aliases used by md5_mb_asimd_x4 and the macros above
+*/
+ declare_var_vector_reg tmp0, 0  // scratch vectors of add_key_rol and the round macros
+ declare_var_vector_reg tmp1, 1
+ declare_var_vector_reg k, 2  // k and k1 alternate as current / prefetched step constant
+ declare_var_vector_reg k1, 3
+ declare_var_vector_reg a, 4  // working digest, one 32-bit lane per job
+ declare_var_vector_reg b, 5
+ declare_var_vector_reg c, 6
+ declare_var_vector_reg d, 7
+ declare_var_vector_reg a1, 8  // copy of the digest taken at the start of each block (v8-v11)
+ declare_var_vector_reg b1, 9
+ declare_var_vector_reg c1, 10
+ declare_var_vector_reg d1, 11
+
+ declare_var_vector_reg w0, 16  // w0..w15: message word i of all four jobs
+ declare_var_vector_reg w1, 17
+ declare_var_vector_reg w2, 18
+ declare_var_vector_reg w3, 19
+ declare_var_vector_reg w4, 20
+ declare_var_vector_reg w5, 21
+ declare_var_vector_reg w6, 22
+ declare_var_vector_reg w7, 23
+ declare_var_vector_reg w8, 24
+ declare_var_vector_reg w9, 25
+ declare_var_vector_reg w10, 26
+ declare_var_vector_reg w11, 27
+ declare_var_vector_reg w12, 28
+ declare_var_vector_reg w13, 29
+ declare_var_vector_reg w14, 30
+ declare_var_vector_reg w15, 31
+
+ len .req w4  // fifth argument, 32-bit view
+ len_x .req x4  // same register, 64-bit view
+ lane0 .req x5  // running input pointers of the four jobs
+ lane1 .req x6
+ lane2 .req x7
+ lane3 .req x9
+ end .req x4  // overwrites len once the end address has been computed
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ key_adr .req x10  // walks through .key_consts
+
+/*
+ void md5_mb_asimd_x4(MD5_JOB * job0, MD5_JOB * job1,
+ MD5_JOB * job2, MD5_JOB * job3, int len)
+*/
+ .global md5_mb_asimd_x4
+ .type md5_mb_asimd_x4, %function
+md5_mb_asimd_x4:  // MD5 update of four jobs in parallel, one job per 32-bit vector lane
+ stp x29,x30,[sp,-48]!
+ ldr lane0,[job0],64  // load 8 bytes from the start of each job, then advance the job pointer by 64
+ stp d8,d9,[sp,16]  // v8-v11 are used as a1..d1, so d8-d11 are saved here
+ ldr lane1,[job1],64
+ stp d10,d11,[sp,32]
+ ldr lane2,[job2],64
+ cmp len,0
+ ldr lane3,[job3],64
+ ble .exit  // len <= 0: nothing to hash
+
+ //load digests
+ ld4 {v_a.s-v_d.s}[0],[job0]  // the four digest words of job i go to lane i of v_a..v_d
+ add end,lane0,len_x,lsl 6  // end = lane0 + (x4 << 6). NOTE(review): reads all 64 bits of x4 although len is declared int - confirm callers zero-extend
+ ld4 {v_a.s-v_d.s}[1],[job1]
+ ld4 {v_a.s-v_d.s}[2],[job2]
+ ld4 {v_a.s-v_d.s}[3],[job3]
+.loop_start:
+ ld1 {v_w0.s}[0],[lane0],4  // message word 0 of every job
+ mov v_a1.16b,v_a.16b  // keep the incoming digest for the final addition
+ ld1 {v_w0.s}[1],[lane1],4
+ mov v_b1.16b,v_b.16b
+ ld1 {v_w0.s}[2],[lane2],4
+ mov v_c1.16b,v_c.16b
+ ld1 {v_w0.s}[3],[lane3],4
+ mov v_d1.16b,v_d.16b
+
+ ld3 {v_w1.s-v_w3.s}[0],[lane0],12  // message words 1..3
+ adrp key_adr,.key_consts  // key_adr is reset to the start of the constant table for every block
+ ld3 {v_w1.s-v_w3.s}[1],[lane1],12
+ add key_adr,key_adr,#:lo12:.key_consts
+ ld3 {v_w1.s-v_w3.s}[2],[lane2],12
+ ldr q_k,[key_adr],16  // constant of step 0
+ ld3 {v_w1.s-v_w3.s}[3],[lane3],12
+
+
+ ld4 {v_w4.s-v_w7.s}[0], [lane0],16  // remaining message loads are interleaved with the first steps
+
+ round_0_15 a,b,c,d,k,k1,w0,7
+
+ ld4 {v_w4.s-v_w7.s}[1], [lane1],16
+ round_0_15 d,a,b,c,k1,k,w1,12
+ ld4 {v_w4.s-v_w7.s}[2], [lane2],16
+ round_0_15 c,d,a,b,k,k1,w2,17
+ ld4 {v_w4.s-v_w7.s}[3], [lane3],16
+ round_0_15 b,c,d,a,k1,k,w3,22
+ ld4 {v_w8.s-v_w11.s}[0],[lane0],16
+ round_0_15 a,b,c,d,k,k1,w4,7
+ ld4 {v_w8.s-v_w11.s}[1],[lane1],16
+ round_0_15 d,a,b,c,k1,k,w5,12
+ ld4 {v_w8.s-v_w11.s}[2],[lane2],16
+ round_0_15 c,d,a,b,k,k1,w6,17
+ ld4 {v_w8.s-v_w11.s}[3],[lane3],16
+ round_0_15 b,c,d,a,k1,k,w7,22
+ ld4 {v_w12.s-v_w15.s}[0],[lane0],16
+ round_0_15 a,b,c,d,k,k1,w8,7
+ ld4 {v_w12.s-v_w15.s}[1],[lane1],16
+ round_0_15 d,a,b,c,k1,k,w9,12
+ ld4 {v_w12.s-v_w15.s}[2],[lane2],16
+ round_0_15 c,d,a,b,k,k1,w10,17
+ ld4 {v_w12.s-v_w15.s}[3],[lane3],16
+ round_0_15 b,c,d,a,k1,k,w11,22
+ round_0_15 a,b,c,d,k,k1,w12,7
+ round_0_15 d,a,b,c,k1,k,w13,12
+ round_0_15 c,d,a,b,k,k1,w14,17
+ round_0_15 b,c,d,a,k1,k,w15,22
+
+ round_16_31 a,b,c,d,k,k1,w1,5
+ round_16_31 d,a,b,c,k1,k,w6,9
+ round_16_31 c,d,a,b,k,k1,w11,14
+ round_16_31 b,c,d,a,k1,k,w0,20
+ round_16_31 a,b,c,d,k,k1,w5,5
+ round_16_31 d,a,b,c,k1,k,w10,9
+ round_16_31 c,d,a,b,k,k1,w15,14
+ round_16_31 b,c,d,a,k1,k,w4,20
+ round_16_31 a,b,c,d,k,k1,w9,5
+ round_16_31 d,a,b,c,k1,k,w14,9
+ round_16_31 c,d,a,b,k,k1,w3,14
+ round_16_31 b,c,d,a,k1,k,w8,20
+ round_16_31 a,b,c,d,k,k1,w13,5
+ round_16_31 d,a,b,c,k1,k,w2,9
+ round_16_31 c,d,a,b,k,k1,w7,14
+ round_16_31 b,c,d,a,k1,k,w12,20
+
+ round_32_47 a,b,c,d,k,k1,w5,4
+ round_32_47 d,a,b,c,k1,k,w8,11
+ round_32_47 c,d,a,b,k,k1,w11,16
+ round_32_47 b,c,d,a,k1,k,w14,23
+ round_32_47 a,b,c,d,k,k1,w1,4
+ round_32_47 d,a,b,c,k1,k,w4,11
+ round_32_47 c,d,a,b,k,k1,w7,16
+ round_32_47 b,c,d,a,k1,k,w10,23
+ round_32_47 a,b,c,d,k,k1,w13,4
+ round_32_47 d,a,b,c,k1,k,w0,11
+ round_32_47 c,d,a,b,k,k1,w3,16
+ round_32_47 b,c,d,a,k1,k,w6,23
+ round_32_47 a,b,c,d,k,k1,w9,4
+ round_32_47 d,a,b,c,k1,k,w12,11
+ round_32_47 c,d,a,b,k,k1,w15,16
+ round_32_47 b,c,d,a,k1,k,w2,23
+
+ round_48_63 a,b,c,d,k,k1,w0,6
+ round_48_63 d,a,b,c,k1,k,w7,10
+ round_48_63 c,d,a,b,k,k1,w14,15
+ round_48_63 b,c,d,a,k1,k,w5,21
+ round_48_63 a,b,c,d,k,k1,w12,6
+ round_48_63 d,a,b,c,k1,k,w3,10
+ round_48_63 c,d,a,b,k,k1,w10,15
+ round_48_63 b,c,d,a,k1,k,w1,21
+ round_48_63 a,b,c,d,k,k1,w8,6
+ round_48_63 d,a,b,c,k1,k,w15,10
+ round_48_63 c,d,a,b,k,k1,w6,15
+ round_48_63 b,c,d,a,k1,k,w13,21
+ round_48_63 a,b,c,d,k,k1,w4,6
+ round_48_63 d,a,b,c,k1,k,w11,10
+ round_48_63 c,d,a,b,k,k1,w2,15
+ round_48_63 b,c,d,a,k1, ,w9,21  // empty k1: the last step does not prefetch another constant
+
+
+
+
+ cmp lane0,end  // every lane pointer advances by the same amounts, lane0 alone decides loop exit
+ add v_a.4s,v_a1.4s,v_a.4s  // add the digest saved at the start of the block
+ add v_b.4s,v_b1.4s,v_b.4s
+ add v_c.4s,v_c1.4s,v_c.4s
+ add v_d.4s,v_d1.4s,v_d.4s
+ bne .loop_start
+
+ st4 {v_a.s-v_d.s}[0],[job0]  // write lane i back as the four digest words of job i
+ st4 {v_a.s-v_d.s}[1],[job1]
+ st4 {v_a.s-v_d.s}[2],[job2]
+ st4 {v_a.s-v_d.s}[3],[job3]
+.exit:
+ ldp d8,d9,[sp,16]
+ ldp d10,d11,[sp,32]
+ ldp x29,x30,[sp],48
+ ret
+.key_consts:
+ .word 0xd76aa478
+ .word 0xd76aa478
+ .word 0xd76aa478
+ .word 0xd76aa478
+ .word 0xe8c7b756
+ .word 0xe8c7b756
+ .word 0xe8c7b756
+ .word 0xe8c7b756
+ .word 0x242070db
+ .word 0x242070db
+ .word 0x242070db
+ .word 0x242070db
+ .word 0xc1bdceee
+ .word 0xc1bdceee
+ .word 0xc1bdceee
+ .word 0xc1bdceee
+ .word 0xf57c0faf
+ .word 0xf57c0faf
+ .word 0xf57c0faf
+ .word 0xf57c0faf
+ .word 0x4787c62a
+ .word 0x4787c62a
+ .word 0x4787c62a
+ .word 0x4787c62a
+ .word 0xa8304613
+ .word 0xa8304613
+ .word 0xa8304613
+ .word 0xa8304613
+ .word 0xfd469501
+ .word 0xfd469501
+ .word 0xfd469501
+ .word 0xfd469501
+ .word 0x698098d8
+ .word 0x698098d8
+ .word 0x698098d8
+ .word 0x698098d8
+ .word 0x8b44f7af
+ .word 0x8b44f7af
+ .word 0x8b44f7af
+ .word 0x8b44f7af
+ .word 0xffff5bb1
+ .word 0xffff5bb1
+ .word 0xffff5bb1
+ .word 0xffff5bb1
+ .word 0x895cd7be
+ .word 0x895cd7be
+ .word 0x895cd7be
+ .word 0x895cd7be
+ .word 0x6b901122
+ .word 0x6b901122
+ .word 0x6b901122
+ .word 0x6b901122
+ .word 0xfd987193
+ .word 0xfd987193
+ .word 0xfd987193
+ .word 0xfd987193
+ .word 0xa679438e
+ .word 0xa679438e
+ .word 0xa679438e
+ .word 0xa679438e
+ .word 0x49b40821
+ .word 0x49b40821
+ .word 0x49b40821
+ .word 0x49b40821
+ .word 0xf61e2562
+ .word 0xf61e2562
+ .word 0xf61e2562
+ .word 0xf61e2562
+ .word 0xc040b340
+ .word 0xc040b340
+ .word 0xc040b340
+ .word 0xc040b340
+ .word 0x265e5a51
+ .word 0x265e5a51
+ .word 0x265e5a51
+ .word 0x265e5a51
+ .word 0xe9b6c7aa
+ .word 0xe9b6c7aa
+ .word 0xe9b6c7aa
+ .word 0xe9b6c7aa
+ .word 0xd62f105d
+ .word 0xd62f105d
+ .word 0xd62f105d
+ .word 0xd62f105d
+ .word 0x02441453
+ .word 0x02441453
+ .word 0x02441453
+ .word 0x02441453
+ .word 0xd8a1e681
+ .word 0xd8a1e681
+ .word 0xd8a1e681
+ .word 0xd8a1e681
+ .word 0xe7d3fbc8
+ .word 0xe7d3fbc8
+ .word 0xe7d3fbc8
+ .word 0xe7d3fbc8
+ .word 0x21e1cde6
+ .word 0x21e1cde6
+ .word 0x21e1cde6
+ .word 0x21e1cde6
+ .word 0xc33707d6
+ .word 0xc33707d6
+ .word 0xc33707d6
+ .word 0xc33707d6
+ .word 0xf4d50d87
+ .word 0xf4d50d87
+ .word 0xf4d50d87
+ .word 0xf4d50d87
+ .word 0x455a14ed
+ .word 0x455a14ed
+ .word 0x455a14ed
+ .word 0x455a14ed
+ .word 0xa9e3e905
+ .word 0xa9e3e905
+ .word 0xa9e3e905
+ .word 0xa9e3e905
+ .word 0xfcefa3f8
+ .word 0xfcefa3f8
+ .word 0xfcefa3f8
+ .word 0xfcefa3f8
+ .word 0x676f02d9
+ .word 0x676f02d9
+ .word 0x676f02d9
+ .word 0x676f02d9
+ .word 0x8d2a4c8a
+ .word 0x8d2a4c8a
+ .word 0x8d2a4c8a
+ .word 0x8d2a4c8a
+ .word 0xfffa3942
+ .word 0xfffa3942
+ .word 0xfffa3942
+ .word 0xfffa3942
+ .word 0x8771f681
+ .word 0x8771f681
+ .word 0x8771f681
+ .word 0x8771f681
+ .word 0x6d9d6122
+ .word 0x6d9d6122
+ .word 0x6d9d6122
+ .word 0x6d9d6122
+ .word 0xfde5380c
+ .word 0xfde5380c
+ .word 0xfde5380c
+ .word 0xfde5380c
+ .word 0xa4beea44
+ .word 0xa4beea44
+ .word 0xa4beea44
+ .word 0xa4beea44
+ .word 0x4bdecfa9
+ .word 0x4bdecfa9
+ .word 0x4bdecfa9
+ .word 0x4bdecfa9
+ .word 0xf6bb4b60
+ .word 0xf6bb4b60
+ .word 0xf6bb4b60
+ .word 0xf6bb4b60
+ .word 0xbebfbc70
+ .word 0xbebfbc70
+ .word 0xbebfbc70
+ .word 0xbebfbc70
+ .word 0x289b7ec6
+ .word 0x289b7ec6
+ .word 0x289b7ec6
+ .word 0x289b7ec6
+ .word 0xeaa127fa
+ .word 0xeaa127fa
+ .word 0xeaa127fa
+ .word 0xeaa127fa
+ .word 0xd4ef3085
+ .word 0xd4ef3085
+ .word 0xd4ef3085
+ .word 0xd4ef3085
+ .word 0x04881d05
+ .word 0x04881d05
+ .word 0x04881d05
+ .word 0x04881d05
+ .word 0xd9d4d039
+ .word 0xd9d4d039
+ .word 0xd9d4d039
+ .word 0xd9d4d039
+ .word 0xe6db99e5
+ .word 0xe6db99e5
+ .word 0xe6db99e5
+ .word 0xe6db99e5
+ .word 0x1fa27cf8
+ .word 0x1fa27cf8
+ .word 0x1fa27cf8
+ .word 0x1fa27cf8
+ .word 0xc4ac5665
+ .word 0xc4ac5665
+ .word 0xc4ac5665
+ .word 0xc4ac5665
+ .word 0xf4292244
+ .word 0xf4292244
+ .word 0xf4292244
+ .word 0xf4292244
+ .word 0x432aff97
+ .word 0x432aff97
+ .word 0x432aff97
+ .word 0x432aff97
+ .word 0xab9423a7
+ .word 0xab9423a7
+ .word 0xab9423a7
+ .word 0xab9423a7
+ .word 0xfc93a039
+ .word 0xfc93a039
+ .word 0xfc93a039
+ .word 0xfc93a039
+ .word 0x655b59c3
+ .word 0x655b59c3
+ .word 0x655b59c3
+ .word 0x655b59c3
+ .word 0x8f0ccc92
+ .word 0x8f0ccc92
+ .word 0x8f0ccc92
+ .word 0x8f0ccc92
+ .word 0xffeff47d
+ .word 0xffeff47d
+ .word 0xffeff47d
+ .word 0xffeff47d
+ .word 0x85845dd1
+ .word 0x85845dd1
+ .word 0x85845dd1
+ .word 0x85845dd1
+ .word 0x6fa87e4f
+ .word 0x6fa87e4f
+ .word 0x6fa87e4f
+ .word 0x6fa87e4f
+ .word 0xfe2ce6e0
+ .word 0xfe2ce6e0
+ .word 0xfe2ce6e0
+ .word 0xfe2ce6e0
+ .word 0xa3014314
+ .word 0xa3014314
+ .word 0xa3014314
+ .word 0xa3014314
+ .word 0x4e0811a1
+ .word 0x4e0811a1
+ .word 0x4e0811a1
+ .word 0x4e0811a1
+ .word 0xf7537e82
+ .word 0xf7537e82
+ .word 0xf7537e82
+ .word 0xf7537e82
+ .word 0xbd3af235
+ .word 0xbd3af235
+ .word 0xbd3af235
+ .word 0xbd3af235
+ .word 0x2ad7d2bb
+ .word 0x2ad7d2bb
+ .word 0x2ad7d2bb
+ .word 0x2ad7d2bb
+ .word 0xeb86d391
+ .word 0xeb86d391
+ .word 0xeb86d391
+ .word 0xeb86d391
+ .size md5_mb_asimd_x4, .-md5_mb_asimd_x4
diff --git a/drv/hash_mb/md5_mb_sve.S b/drv/hash_mb/md5_mb_sve.S
new file mode 100644
index 0000000..8d8ecc1
--- /dev/null
+++ b/drv/hash_mb/md5_mb_sve.S
@@ -0,0 +1,158 @@
+/**********************************************************************
+ Copyright(c) 2022 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8.2-a+sve
+
+// copying data from sparse memory onto contiguous stack space
+// in order to gather-load into SVE registers
+.macro copy_mb_16words vecs:req,dest:req
+ mov src,\vecs
+ mov dst,\dest
+ mov counter,total_lanes  // one iteration per lane
+10:
+ ldr tmp,[src],8  // next pointer from the vecs array
+ ldr tmp,[tmp]  // first 8 bytes of the pointed-to object, used as the data address
+ add tmp,tmp,block_ctr,lsl 6  // skip block_ctr blocks of 64 bytes
+ ld1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [tmp]  // read one 64-byte block
+ st1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [dst],64  // append it to the destination buffer
+ subs counter,counter,1
+ b.ne 10b
+.endm
+
+.macro load_init  // per-block setup: build the gather offsets and stage the block of every lane in databuf
+ mov tmpw,16
+ index VOFFS.s,0,tmpw  // VOFFS = 0, 16, 32, ... (word offsets, i.e. one 64-byte block per lane)
+ copy_mb_16words job_vec,databuf
+.endm
+
+.macro load_word pipelines:req,windex:req,zreg0:req,zreg1  // gather message word windex of every lane from databuf
+ add tmp,databuf,\windex * 4  // byte offset of the word inside each 64-byte block
+ ld1w { \zreg0\().s}, p0/z, [tmp, VOFFS.s, UXTW 2]  // offsets are scaled by 4, so consecutive lanes are 64 bytes apart
+ .if \pipelines > 1
+ add tmp,tmp,veclen,lsl #6  // the second vector starts veclen blocks further into databuf
+ ld1w {\zreg1\().s}, p1/z, [tmp, VOFFS.s, UXTW 2]
+ .endif
+.endm
+
+#include "md5_sve_common.S"
+ .text  // md5_sve_common.S ends inside .rodata.cst16 (mergeable read-only data), switch back so the functions below are emitted as code
+/* int md5_mb_sve_max_lanes(): returns 2 * cntw, i.e. twice the number of
+ * 32-bit elements per SVE vector (md5_mb_sve handles up to two vectors). */
+ .global md5_mb_sve_max_lanes
+ .type md5_mb_sve_max_lanes, %function
+md5_mb_sve_max_lanes:
+ cntw x0
+ add x0,x0,x0
+ ret
+ .size md5_mb_sve_max_lanes, .-md5_mb_sve_max_lanes
+
+/*
+ * void md5_mb_sve(int blocks, int total_lanes, MD5_JOB **job_vec)
+ */
+ num_blocks .req w0  // first argument
+ total_lanes .req w1  // second argument, also consumed as loop counter in .str_hash
+ job_vec .req x2  // third argument
+ src .req x5
+ dst .req x6
+ tmp .req x8  // tmp and tmpw are two views of the same register
+ tmpw .req w8
+ block_ctr .req x9  // index of the block being processed
+ block_ctr_w .req w9
+ savedsp .req x10  // stack pointer before the buffers were carved out
+ databuf .req x11  // 64 bytes per lane, staging area for the current block
+ counter .req w12
+ veclen .req x13  // number of active elements in p0
+ veclen_w .req w13
+ abcd_buf .req x14  // 16 bytes per lane, staging area for the digests
+ md5key_adr .req x15  // base address of MD5_CONST_KEYS
+
+ .global md5_mb_sve
+ .type md5_mb_sve, %function
+md5_mb_sve:
+ cbz num_blocks,.return  // nothing to do for zero blocks. NOTE(review): total_lanes == 0 is not checked and would wrap the subs/bne counters below - presumably callers pass at least 1
+ md5_sve_save_stack
+ mov savedsp,sp
+ // reserve (16 * lanes) for abcd buf
+ mov tmpw,total_lanes,lsl 4
+ sub abcd_buf,sp,tmp
+ // reserve (64 * lanes) for data buf
+ mov tmpw,total_lanes,lsl 6
+ sub databuf,abcd_buf,tmp
+ mov sp,databuf  // both buffers now live below the saved stack pointer
+ adr md5key_adr,MD5_CONST_KEYS
+ whilelo p0.s,wzr,total_lanes  // p0 covers the lanes that fit into the first vector
+ mov src,job_vec
+ mov dst,abcd_buf
+ mov counter,total_lanes
+.ldr_hash:
+ ldr tmp,[src],8  // next job pointer
+ add tmp,tmp,64  // the digest is read from job + 64
+ ld1 {v0.16b},[tmp]
+ st1 {v0.16b},[dst],16  // pack the 16-byte digests back to back in abcd_buf
+ subs counter,counter,1
+ bne .ldr_hash
+ ld4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0/z,[abcd_buf]  // de-interleave into one vector per digest word
+ mov block_ctr,0
+ cntp veclen,p0,p0.s  // number of lanes handled by the first vector
+ cmp veclen_w,total_lanes
+ b.eq .loop_1x  // everything fits into one vector
+ whilelo p1.s,veclen_w,total_lanes  // p1 covers the remaining lanes
+ add tmp,abcd_buf,veclen,lsl #4
+ ld4w {VA_1.s,VB_1.s,VC_1.s,VD_1.s},p1/z,[tmp]
+ b .loop_2x
+.loop_1x:
+ md5_single 1  // expands to the full unrolled block function for one vector
+ add block_ctr, block_ctr, 1
+ cmp block_ctr_w,num_blocks
+ bne .loop_1x
+ st4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0,[abcd_buf]
+ b 1f
+.loop_2x:
+ md5_single 2  // expands to the full unrolled block function for two vectors
+ add block_ctr, block_ctr, 1
+ cmp block_ctr_w,num_blocks
+ bne .loop_2x
+ st4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0,[abcd_buf]
+ add tmp,abcd_buf,veclen,lsl #4
+ st4w {VA_1.s,VB_1.s,VC_1.s,VD_1.s},p1,[tmp]
+1:
+ mov dst,job_vec
+ mov src,abcd_buf
+.str_hash:
+ ld1 {v0.16b},[src],16  // copy every digest from abcd_buf back to job + 64
+ ldr tmp,[dst],8
+ add tmp,tmp,64
+ st1 {v0.16b},[tmp]
+ subs total_lanes,total_lanes,1  // total_lanes itself is the counter here and is zero afterwards
+ bne .str_hash
+ mov sp,savedsp  // release both buffers
+ md5_sve_restore_stack
+.return:
+ ret
+ .size md5_mb_sve, .-md5_mb_sve
diff --git a/drv/hash_mb/md5_sve_common.S b/drv/hash_mb/md5_sve_common.S
new file mode 100644
index 0000000..ed81482
--- /dev/null
+++ b/drv/hash_mb/md5_sve_common.S
@@ -0,0 +1,478 @@
+/**********************************************************************
+ Copyright(c) 2022 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ VK .req z0  // broadcast step constant
+ VOFFS .req z1  // gather offsets built by load_init
+ VA_0 .req z2  // working digest of the first vector, the aliases are rotated by SWAP_STATES
+ VB_0 .req z3
+ VC_0 .req z4
+ VD_0 .req z5
+ VF_0 .req z6  // round function result, first / second vector
+ VF_1 .req z7
+ VA_1 .req z16  // working digest of the second vector
+ VB_1 .req z17
+ VC_1 .req z18
+ VD_1 .req z19
+ MD5WORD0_0 .req z20  // two alternating message-word registers per vector
+ MD5WORD1_0 .req z21
+ MD5WORD0_1 .req z22
+ MD5WORD1_1 .req z23
+ TMPV0 .req v20  // TMPV0..3 are the NEON views of z20-z23 and therefore overlap the MD5WORD registers
+ TMPV1 .req v21
+ TMPV2 .req v22
+ TMPV3 .req v23
+ VTMP_0 .req z24
+ VAA_0 .req z25  // digest saved at the start of a block, first vector
+ VBB_0 .req z26
+ VCC_0 .req z27
+ VDD_0 .req z28
+ VTMP_1 .req z29
+ VAA_1 .req z30  // digest saved at the start of a block, second vector
+ VBB_1 .req z31
+ VCC_1 .req z8  // z8/z9 overlap d8/d9, which md5_sve_save_stack preserves
+ VDD_1 .req z9
+ TT .req z0  // placeholder alias, re-pointed by SWAP_STATES
+
+.macro rotate_left_x1 out:req,in:req,tmp:req,bits  // out = in rotated left by bits in every 32-bit element
+ .if \bits == 16
+ revh \out\().s,p0/m,\in\().s  // swapping the two halfwords equals a rotate by 16
+ .else
+ .if have_sve2 == 0
+ lsl \tmp\().s, \in\().s,\bits  // plain SVE: shift left, shift right, combine
+ lsr \out\().s,\in\().s,32-\bits
+ orr \out\().d,\out\().d,\tmp\().d
+ .else
+ movprfx \out\().d,\in\().d
+ xar \out\().s,\out\().s,VZERO.s,32-\bits  // NOTE(review): VZERO has no .req in the visible part of this file - confirm before enabling the sve2 path
+ .endif
+ .endif
+.endm
+
+.macro rotate_left_x2 out:req,in:req,tmp:req,bits,out1:req,in1:req,tmp1:req,bits1  // two-vector variant of rotate_left_x1 with interleaved instructions
+
+ .if \bits == 16  // only bits is tested, callers in this file pass the same value as bits1
+ revh \out\().s,p0/m,\in\().s
+ revh \out1\().s,p0/m,\in1\().s  // p0 is used for both vectors
+ .else
+ .if have_sve2 == 0
+ lsl \tmp\().s, \in\().s,\bits
+ lsl \tmp1\().s, \in1\().s,\bits1
+ lsr \out\().s,\in\().s,32-\bits
+ lsr \out1\().s,\in1\().s,32-\bits1
+ orr \out\().d,\out\().d,\tmp\().d
+ orr \out1\().d,\out1\().d,\tmp1\().d
+ .else
+ movprfx \out\().d,\in\().d
+ xar \out\().s,\out\().s,VZERO.s,32-\bits
+ movprfx \out1\().d,\in1\().d
+ xar \out1\().s,\out1\().s,VZERO.s,32-\bits1
+ .endif
+ .endif
+.endm
+
+.macro bsl_x1 ret:req,x:req,y:req,z:req,tmp:req  // bitwise select: ret = (x & y) | (~x & z)
+ .if have_sve2 == 0
+ bic \ret\().d,\z\().d,\x\().d  // z & ~x
+ and \tmp\().d,\x\().d,\y\().d  // x & y
+ orr \ret\().d,\ret\().d,\tmp\().d
+ .else
+ movprfx \ret\().d,\x\().d
+ bsl \ret\().d,\ret\().d,\y\().d,\z\().d
+ .endif
+.endm
+
+.macro bsl_x2 ret:req,x:req,y:req,z:req,tmp:req,ret1:req,x1:req,y1:req,z1:req,tmp1:req  // bsl_x1 applied to two independent register sets, interleaved
+ .if have_sve2 == 0
+ bic \ret\().d,\z\().d,\x\().d  // z & ~x
+ bic \ret1\().d,\z1\().d,\x1\().d
+ and \tmp\().d,\x\().d,\y\().d  // x & y
+ and \tmp1\().d,\x1\().d,\y1\().d
+ orr \ret\().d,\ret\().d,\tmp\().d
+ orr \ret1\().d,\ret1\().d,\tmp1\().d
+ .else
+ movprfx \ret\().d,\x\().d
+ bsl \ret\().d,\ret\().d,\y\().d,\z\().d
+ movprfx \ret1\().d,\x1\().d
+ bsl \ret1\().d,\ret1\().d,\y1\().d,\z1\().d
+ .endif
+.endm
+
+
+// F = D ^ (B and (C xor D))
+// that is (B and C) or ((not B) and D)
+.macro FUNC_F0_x1  // result in VF_0, VTMP_0 is scratch
+ bsl_x1 VF_0,VB_0,VC_0,VD_0,VTMP_0
+.endm
+
+.macro FUNC_F0_x2  // results in VF_0 and VF_1
+ bsl_x2 VF_0,VB_0,VC_0,VD_0,VTMP_0,VF_1,VB_1,VC_1,VD_1,VTMP_1
+.endm
+
+// F = C xor (D and (B xor C))
+// that is (D and B) or ((not D) and C)
+.macro FUNC_F1_x1  // result in VF_0, VTMP_0 is scratch
+ bsl_x1 VF_0,VD_0,VB_0,VC_0,VTMP_0
+.endm
+
+.macro FUNC_F1_x2  // results in VF_0 and VF_1
+ bsl_x2 VF_0,VD_0,VB_0,VC_0,VTMP_0,VF_1,VD_1,VB_1,VC_1,VTMP_1
+.endm
+
+// F := B xor C xor D
+.macro FUNC_F2_x1  // result in VF_0
+ .if have_sve2 == 0
+ eor VF_0.d,VB_0.d,VC_0.d
+ eor VF_0.d,VF_0.d,VD_0.d
+ .else
+ movprfx VF_0.d,VB_0.d
+ eor3 VF_0.d,VF_0.d,VC_0.d,VD_0.d  // three-way xor in one instruction on the sve2 path
+ .endif
+.endm
+
+.macro FUNC_F2_x2  // results in VF_0 and VF_1
+ .if have_sve2 == 0
+ eor VF_0.d,VB_0.d,VC_0.d
+ eor VF_1.d,VB_1.d,VC_1.d
+ eor VF_0.d,VF_0.d,VD_0.d
+ eor VF_1.d,VF_1.d,VD_1.d
+ .else
+ movprfx VF_0.d,VB_0.d
+ eor3 VF_0.d,VF_0.d,VC_0.d,VD_0.d
+ movprfx VF_1.d,VB_1.d
+ eor3 VF_1.d,VF_1.d,VC_1.d,VD_1.d
+ .endif
+.endm
+
+// F := C xor (B or (not D))
+.macro FUNC_F3_x1  // result in VF_0
+ not VF_0.s,p0/m,VD_0.s  // predicated by p0 with merging
+ orr VF_0.d,VF_0.d,VB_0.d
+ eor VF_0.d,VF_0.d,VC_0.d
+.endm
+
+.macro FUNC_F3_x2  // results in VF_0 and VF_1
+ not VF_0.s,p0/m,VD_0.s
+ not VF_1.s,p0/m,VD_1.s  // p0 is used for the second vector as well
+ orr VF_0.d,VF_0.d,VB_0.d
+ orr VF_1.d,VF_1.d,VB_1.d
+ eor VF_0.d,VF_0.d,VC_0.d
+ eor VF_1.d,VF_1.d,VC_1.d
+.endm
+
+.macro SWAP_STATES  // rotates the register aliases (A takes old D, D old C, C old B, B old A), emits no instructions
+ .unreq TT
+ TT .req VA_0  // remember the register currently named A
+ .unreq VA_0
+ VA_0 .req VD_0
+ .unreq VD_0
+ VD_0 .req VC_0
+ .unreq VC_0
+ VC_0 .req VB_0
+ .unreq VB_0
+ VB_0 .req TT
+
+ .unreq TT
+ TT .req VA_1  // same rotation for the second vector
+ .unreq VA_1
+ VA_1 .req VD_1
+ .unreq VD_1
+ VD_1 .req VC_1
+ .unreq VC_1
+ VC_1 .req VB_1
+ .unreq VB_1
+ VB_1 .req TT
+.endm
+
+.macro MD5_STEP_x1 windex:req,mg:req,func_f:req,bits:req  // one MD5 step on one vector: A = B + rol(A + F + K + word, bits)
+ ld1rw {VK.s},p0/z,[md5key_adr,windex * 4]  // broadcast constant number windex. NOTE(review): windex has no backslash, presumably substituted because .altmacro is active at expansion time
+ \func_f\()_x1
+ add VTMP_0.s,VA_0.s,\mg\()_0.s  // A + message word
+ add VF_0.s,VF_0.s,VK.s
+ add VF_0.s,VF_0.s,VTMP_0.s  // F + K + A + word
+ rotate_left_x1 VA_0,VF_0,VTMP_0,\bits
+ add VA_0.s,VA_0.s,VB_0.s
+.endm
+
+.macro MD5_STEP_x2 windex:req,mg:req,func_f:req,bits:req  // one MD5 step on two vectors, instructions interleaved
+ ld1rw {VK.s},p0/z,[md5key_adr,windex * 4]  // the same broadcast constant serves both vectors. NOTE(review): windex has no backslash, presumably substituted because .altmacro is active at expansion time
+ \func_f\()_x2
+ add VTMP_0.s,VA_0.s,\mg\()_0.s  // A + message word
+ add VTMP_1.s,VA_1.s,\mg\()_1.s
+ add VF_0.s,VF_0.s,VK.s
+ add VF_1.s,VF_1.s,VK.s
+ add VF_0.s,VF_0.s,VTMP_0.s  // F + K + A + word
+ add VF_1.s,VF_1.s,VTMP_1.s
+ rotate_left_x2 VA_0,VF_0,VTMP_0,\bits,VA_1,VF_1,VTMP_1,\bits
+ add VA_0.s,VA_0.s,VB_0.s
+ add VA_1.s,VA_1.s,VB_1.s
+.endm
+
+.altmacro
+.macro load_words index:req,mg:req  // load message word number index into the MD5WORD register pair selected by mg
+ load_word %num_pipelines,\index,MD5WORD\mg\()_0,MD5WORD\mg\()_1  // load_word is supplied by the file that includes this one
+.endm
+
+.macro MD5_STEP_WRAPPER pipelines:req,windex:req,gindex:req,mg:req,\
+ func_f:req,bits:req,gindex_next,mg_next
+ .ifnb \gindex_next  // prefetch the message word of the following step, if there is one
+ load_words \gindex_next,\mg_next
+ .endif
+ MD5_STEP_x\pipelines\() \windex,MD5WORD\mg\(),\func_f,\bits  // dispatch to the one- or two-vector step macro
+.endm
+
+.macro exec_step windex:req,gindex:req,bits:req,gindex_next  // emit MD5 step windex using message word gindex, then rotate the state aliases
+ .if \windex % 2 == 0  // even and odd steps alternate between the two message-word registers
+ mg=0
+ mg_next=1
+ .else
+ mg=1
+ mg_next=0
+ .endif
+
+ .if \windex <= 15  // steps 0-15 use FUNC_F0
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
+ FUNC_F0,\bits,\gindex_next,%mg_next
+ .endif
+ .if \windex >= 16 && \windex <= 31  // steps 16-31 use FUNC_F1
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
+ FUNC_F1,\bits,\gindex_next,%mg_next
+ .endif
+ .if \windex >= 32 && \windex <= 47  // steps 32-47 use FUNC_F2
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
+ FUNC_F2,\bits,\gindex_next,%mg_next
+ .endif
+ .if \windex >= 48 && \windex < 63  // steps 48-62 use FUNC_F3
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\
+ FUNC_F3,\bits,\gindex_next,%mg_next
+ .endif
+ .if \windex == 63  // last step: FUNC_F3 without a prefetch
+ MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,FUNC_F3,\bits
+ .endif
+ SWAP_STATES
+.endm
+
+.macro exec_steps  // all 64 MD5 steps, arguments are step index, message word index, rotate amount, next message word index
+ exec_step 0,0,7,1
+ exec_step 1,1,12,2
+ exec_step 2,2,17,3
+ exec_step 3,3,22,4
+ exec_step 4,4,7,5
+ exec_step 5,5,12,6
+ exec_step 6,6,17,7
+ exec_step 7,7,22,8
+ exec_step 8,8,7,9
+ exec_step 9,9,12,10
+ exec_step 10,10,17,11
+ exec_step 11,11,22,12
+ exec_step 12,12,7,13
+ exec_step 13,13,12,14
+ exec_step 14,14,17,15
+ exec_step 15,15,22,1
+ exec_step 16,1,5,6
+ exec_step 17,6,9,11
+ exec_step 18,11,14,0
+ exec_step 19,0,20,5
+ exec_step 20,5,5,10
+ exec_step 21,10,9,15
+ exec_step 22,15,14,4
+ exec_step 23,4,20,9
+ exec_step 24,9,5,14
+ exec_step 25,14,9,3
+ exec_step 26,3,14,8
+ exec_step 27,8,20,13
+ exec_step 28,13,5,2
+ exec_step 29,2,9,7
+ exec_step 30,7,14,12
+ exec_step 31,12,20,5
+ exec_step 32,5,4,8
+ exec_step 33,8,11,11
+ exec_step 34,11,16,14
+ exec_step 35,14,23,1
+ exec_step 36,1,4,4
+ exec_step 37,4,11,7
+ exec_step 38,7,16,10
+ exec_step 39,10,23,13
+ exec_step 40,13,4,0
+ exec_step 41,0,11,3
+ exec_step 42,3,16,6
+ exec_step 43,6,23,9
+ exec_step 44,9,4,12
+ exec_step 45,12,11,15
+ exec_step 46,15,16,2
+ exec_step 47,2,23,0
+ exec_step 48,0,6,7
+ exec_step 49,7,10,14
+ exec_step 50,14,15,5
+ exec_step 51,5,21,12
+ exec_step 52,12,6,3
+ exec_step 53,3,10,10
+ exec_step 54,10,15,1
+ exec_step 55,1,21,8
+ exec_step 56,8,6,15
+ exec_step 57,15,10,6
+ exec_step 58,6,15,13
+ exec_step 59,13,21,4
+ exec_step 60,4,6,11
+ exec_step 61,11,10,2
+ exec_step 62,2,15,9
+ exec_step 63,9,21
+.endm
+
+.macro prepare_x1  // prefetch message word 0 and save the incoming digest of one vector
+ load_words 0,0
+ orr VAA_0.d,VA_0.d,VA_0.d  // orr of a register with itself is a register copy
+ orr VBB_0.d,VB_0.d,VB_0.d
+ orr VCC_0.d,VC_0.d,VC_0.d
+ orr VDD_0.d,VD_0.d,VD_0.d
+.endm
+
+.macro prepare_x2  // same as prepare_x1 for both vectors
+ load_words 0,0
+ orr VAA_0.d,VA_0.d,VA_0.d
+ orr VAA_1.d,VA_1.d,VA_1.d
+ orr VBB_0.d,VB_0.d,VB_0.d
+ orr VBB_1.d,VB_1.d,VB_1.d
+ orr VCC_0.d,VC_0.d,VC_0.d
+ orr VCC_1.d,VC_1.d,VC_1.d
+ orr VDD_0.d,VD_0.d,VD_0.d
+ orr VDD_1.d,VD_1.d,VD_1.d
+.endm
+
+.macro finish_x1  // add the digest saved by prepare_x1 to the working state
+ add VA_0.s,VA_0.s,VAA_0.s
+ add VB_0.s,VB_0.s,VBB_0.s
+ add VC_0.s,VC_0.s,VCC_0.s
+ add VD_0.s,VD_0.s,VDD_0.s
+.endm
+
+.macro finish_x2  // add the digests saved by prepare_x2 to both working states
+ add VA_0.s,VA_0.s,VAA_0.s
+ add VA_1.s,VA_1.s,VAA_1.s
+ add VB_0.s,VB_0.s,VBB_0.s
+ add VB_1.s,VB_1.s,VBB_1.s
+ add VC_0.s,VC_0.s,VCC_0.s
+ add VC_1.s,VC_1.s,VCC_1.s
+ add VD_0.s,VD_0.s,VDD_0.s
+ add VD_1.s,VD_1.s,VDD_1.s
+.endm
+
+.macro md5_single pipelines:req,sve2  // process one 64-byte block per lane on one or two vectors, a non-blank sve2 selects the SVE2 code paths
+ .ifnb \sve2
+ have_sve2=1
+ eor VZERO.d,VZERO.d,VZERO.d  // zero operand used by the xar based rotate
+ .else
+ have_sve2=0
+ .endif
+ num_pipelines=\pipelines  // read via %num_pipelines by load_words and exec_step
+ load_init  // supplied by the file that includes this one
+
+ prepare_x\pipelines\()
+ exec_steps
+ finish_x\pipelines\()
+.endm
+
+.macro md5_sve_save_stack  // push d8-d13 onto a new 48-byte stack frame
+ stp d8,d9,[sp, -48]!
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+.endm
+
+.macro md5_sve_restore_stack  // pop d8-d13 and release the 48-byte frame
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d8,d9,[sp],48
+.endm
+
+ .section .rodata.cst16,"aM",@progbits,16  // NOTE(review): this file does not switch back to the previous section, so anything assembled after the include lands in this section too
+ .align 16  // NOTE(review): GNU as on AArch64 presumably treats the operand as a power of two, i.e. 64 KiB alignment - confirm whether 16 bytes was intended
+
+MD5_CONST_KEYS:  // the 64 MD5 step constants in step order, indexed by ld1rw in MD5_STEP_x1 / MD5_STEP_x2
+ .word 0xd76aa478
+ .word 0xe8c7b756
+ .word 0x242070db
+ .word 0xc1bdceee
+ .word 0xf57c0faf
+ .word 0x4787c62a
+ .word 0xa8304613
+ .word 0xfd469501
+ .word 0x698098d8
+ .word 0x8b44f7af
+ .word 0xffff5bb1
+ .word 0x895cd7be
+ .word 0x6b901122
+ .word 0xfd987193
+ .word 0xa679438e
+ .word 0x49b40821
+ .word 0xf61e2562
+ .word 0xc040b340
+ .word 0x265e5a51
+ .word 0xe9b6c7aa
+ .word 0xd62f105d
+ .word 0x02441453
+ .word 0xd8a1e681
+ .word 0xe7d3fbc8
+ .word 0x21e1cde6
+ .word 0xc33707d6
+ .word 0xf4d50d87
+ .word 0x455a14ed
+ .word 0xa9e3e905
+ .word 0xfcefa3f8
+ .word 0x676f02d9
+ .word 0x8d2a4c8a
+ .word 0xfffa3942
+ .word 0x8771f681
+ .word 0x6d9d6122
+ .word 0xfde5380c
+ .word 0xa4beea44
+ .word 0x4bdecfa9
+ .word 0xf6bb4b60
+ .word 0xbebfbc70
+ .word 0x289b7ec6
+ .word 0xeaa127fa
+ .word 0xd4ef3085
+ .word 0x04881d05
+ .word 0xd9d4d039
+ .word 0xe6db99e5
+ .word 0x1fa27cf8
+ .word 0xc4ac5665
+ .word 0xf4292244
+ .word 0x432aff97
+ .word 0xab9423a7
+ .word 0xfc93a039
+ .word 0x655b59c3
+ .word 0x8f0ccc92
+ .word 0xffeff47d
+ .word 0x85845dd1
+ .word 0x6fa87e4f
+ .word 0xfe2ce6e0
+ .word 0xa3014314
+ .word 0x4e0811a1
+ .word 0xf7537e82
+ .word 0xbd3af235
+ .word 0x2ad7d2bb
+ .word 0xeb86d391
diff --git a/drv/hash_mb/sm3_mb_asimd_x1.S b/drv/hash_mb/sm3_mb_asimd_x1.S
new file mode 100644
index 0000000..c7362de
--- /dev/null
+++ b/drv/hash_mb/sm3_mb_asimd_x1.S
@@ -0,0 +1,387 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a // assemble for the Armv8.2-A base instruction set
+ .text
+ .align 2 // 2^2 = 4-byte alignment (the operand is a power-of-two exponent on AArch64)
+ .p2align 3,,7 // 8-byte alignment; the third operand caps the padding at 7 bytes
+// declare_var_vector_reg name,reg: create the aliases q+name, v+name and s+name for SIMD register number reg.
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+// Register aliases used by sm3_mb_asimd_x1 and its round macros.
+ job .req x0
+ len .req x1
+ data .req x2
+ digest .req x0
+// job and digest share x0: the function post-increments job by 64 and then addresses the digest through the same register.
+ msg0 .req w3
+ msg1 .req w4
+ msg2 .req w5
+ msg3 .req w6
+ msg4 .req w7
+// msg0-msg4 are scratch registers for the message expansion in rounds 12-63.
+ msg .req w9
+ msgP .req w10
+ SS1 .req w11
+ SS2 .req w12
+ TT1 .req w13
+ TT2 .req w14
+ Tj .req w15
+ tmp0 .req w19
+ tmp1 .req w20
+ dig_A .req w21
+ dig_B .req w22
+ dig_C .req w23
+ dig_D .req w24
+ dig_E .req w25
+ dig_F .req w26
+ dig_G .req w27
+ dig_H .req w28
+// w19-w28 are callee-saved; sm3_mb_asimd_x1 spills x19-x28 on entry. tmp1 is declared but not referenced in this file.
+ declare_var_vector_reg dig0,0
+ declare_var_vector_reg dig1,1
+ declare_var_vector_reg dig0_bak,2
+ declare_var_vector_reg dig1_bak,3
+ declare_var_vector_reg vect_msg0,4
+ declare_var_vector_reg vect_msg1,5
+ declare_var_vector_reg vect_msg2,6
+ declare_var_vector_reg vect_msg3,7
+// vect_msgP0-2 hold the XOR of adjacent message vectors that sm3_mb_asimd_x1 stores at wp_off.
+ declare_var_vector_reg vect_msgP0,16
+ declare_var_vector_reg vect_msgP1,17
+ declare_var_vector_reg vect_msgP2,18
+
+
+
+
+
+// sm3_round_0: one scalar SM3 round with the XOR boolean functions; both message inputs are read ready-made from the stack.
+// round 0-11
+.macro sm3_round_0 round:req
+ ldr msg, [sp,msg_off+4*\round\()] // word number round of the msg_off area
+ ldr msgP,[sp,wp_off +4*\round\()] // word number round of the wp_off area
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12 // TT1 = rotl(A,12)
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+ eor TT1,dig_A,dig_B
+ eor TT2,dig_E,dig_F
+ add SS2,SS2,msgP
+ eor TT2,TT2,dig_G // TT2 = E ^ F ^ G
+ add SS1,SS1,msg
+ eor TT1,TT1,dig_C // TT1 = A ^ B ^ C
+ add SS2,SS2,dig_D
+ add SS1,SS1,dig_H
+ add TT1,TT1,SS2 // TT1 = (A^B^C) + D + SS2 + msgP
+ add TT2,TT2,SS1 // TT2 = (E^F^G) + H + SS1 + msg
+ mov dig_D,dig_C
+ ror dig_C,dig_B,32-9 // C = rotl(B,9)
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17) // TT2 ^ rotl(TT2,17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19 // G = rotl(F,19)
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9) // E = TT2 ^ rotl(TT2,17) ^ rotl(TT2,9)
+ ror Tj,Tj,(32-1) // rotate the round constant left by 1 for the next round
+.endm
+// sm3_round_12: XOR-function round that also expands message word n = round+4 into a 17-slot ring at msg_off (slot = index mod 17).
+//round 12-15
+.macro sm3_round_12 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)] // W[round]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)] // W[n-16]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)] // W[n-9]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12 // TT1 = rotl(A,12)
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)] // W[n-3]
+ eor TT1,dig_A,dig_B
+ eor TT2,dig_E,dig_F
+ add SS2,SS2,dig_D
+ eor TT2,TT2,dig_G // TT2 = E ^ F ^ G
+ add SS1,SS1,msg
+ eor msg0,msg0,msg2,ror (32-15) // X = W[n-16] ^ W[n-9] ^ rotl(W[n-3],15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)] // W[n-13]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)] // W[n-6]
+ eor msg1,msg0,msg0,ror (32 -15) // X ^ rotl(X,15)
+ eor TT1,TT1,dig_C // TT1 = A ^ B ^ C
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7) // W[n-6] ^ rotl(W[n-13],7)
+ eor msg0,msg1,msg0, ror (32-23) // X ^ rotl(X,15) ^ rotl(X,23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4 // msg0 = W[n]
+ add TT2,TT2,SS1 // TT2 = (E^F^G) + H + SS1 + W[round]
+ mov dig_D,dig_C
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)] // store W[n] in its ring slot
+ eor msgP,msg,msg0 // W[round] ^ W[round+4]
+ add TT1,TT1,msgP // TT1 = (A^B^C) + D + SS2 + (W[round] ^ W[round+4])
+ ror dig_C,dig_B,32-9 // C = rotl(B,9)
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17) // TT2 ^ rotl(TT2,17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19 // G = rotl(F,19)
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9) // E = TT2 ^ rotl(TT2,17) ^ rotl(TT2,9)
+ ror Tj,Tj,32-1 // rotate the round constant left by 1
+.endm
+// sm3_round_16: same message expansion as sm3_round_12, but A/B/C go through a majority function and E/F/G through a select function.
+// round 16-62
+.macro sm3_round_16 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)] // W[round]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)] // W[n-16], n = round+4
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)] // W[n-9]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12 // TT1 = rotl(A,12)
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)] // W[n-3]
+ orr TT1,dig_B,dig_C
+ and tmp0,dig_B,dig_C
+
+ eor TT2,dig_F,dig_G
+ and TT1,TT1,dig_A
+ add SS2,SS2,dig_D
+ orr TT1,TT1,tmp0 // TT1 = ((B|C) & A) | (B & C), the bitwise majority of A, B, C
+ and TT2,TT2,dig_E
+ add SS1,SS1,msg
+ eor TT2,TT2,dig_G // TT2 = ((F^G) & E) ^ G: F where E is set, G elsewhere
+
+ eor msg0,msg0,msg2,ror (32-15) // X = W[n-16] ^ W[n-9] ^ rotl(W[n-3],15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)] // W[n-13]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)] // W[n-6]
+ eor msg1,msg0,msg0,ror (32 -15) // X ^ rotl(X,15)
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7) // W[n-6] ^ rotl(W[n-13],7)
+ eor msg0,msg1,msg0, ror (32-23) // X ^ rotl(X,15) ^ rotl(X,23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4 // msg0 = W[n]
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)] // store W[n] in its ring slot
+ eor msgP,msg,msg0 // W[round] ^ W[round+4]
+ add TT1,TT1,msgP
+ ror dig_C,dig_B,32-9 // C = rotl(B,9)
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17) // TT2 ^ rotl(TT2,17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19 // G = rotl(F,19)
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9) // E = TT2 ^ rotl(TT2,17) ^ rotl(TT2,9)
+ ror Tj,Tj,32-1 // rotate the round constant left by 1
+.endm
+// sm3_round_63: same arithmetic as sm3_round_16, but the new state words are inserted into vdig0_bak/vdig1_bak instead of being rotated through dig_A-dig_H, and Tj is not advanced.
+//round 63
+.macro sm3_round_63 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)] // W[round]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)] // W[n-16], n = round+4
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)] // W[n-9]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12 // TT1 = rotl(A,12)
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)] // W[n-3]
+ orr TT1,dig_B,dig_C
+ and tmp0,dig_B,dig_C
+ eor TT2,dig_F,dig_G
+ and TT1,TT1,dig_A
+ add SS2,SS2,dig_D
+ orr TT1,TT1,tmp0 // bitwise majority of A, B, C
+ and TT2,TT2,dig_E
+ add SS1,SS1,msg
+ eor TT2,TT2,dig_G // F where E is set, G elsewhere
+ eor msg0,msg0,msg2,ror (32-15) // X = W[n-16] ^ W[n-9] ^ rotl(W[n-3],15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)] // W[n-13]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)] // W[n-6]
+ eor msg1,msg0,msg0,ror (32 -15) // X ^ rotl(X,15)
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7) // W[n-6] ^ rotl(W[n-13],7)
+ eor msg0,msg1,msg0, ror (32-23) // X ^ rotl(X,15) ^ rotl(X,23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4 // msg0 = W[n]
+ add TT2,TT2,SS1
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)] // store W[n] in its ring slot
+ eor msgP,msg,msg0 // W[round] ^ W[round+4]
+ add TT1,TT1,msgP
+ ins vdig0_bak.s[3],dig_C // new D = old C
+ ror dig_C,dig_B,32-9
+ ins vdig0_bak.s[1],dig_A // new B = old A
+ ins vdig0_bak.s[0],TT1 // new A = TT1
+ ins vdig0_bak.s[2],dig_C // new C = rotl(old B,9)
+ eor TT1,TT2,TT2,ror (32-17) // TT2 ^ rotl(TT2,17)
+ ins vdig1_bak.s[3],dig_G // new H = old G
+ ror dig_G,dig_F,32-19
+ ins vdig1_bak.s[1],dig_E // new F = old E
+ ins vdig1_bak.s[2],dig_G // new G = rotl(old F,19)
+ eor dig_E,TT1,TT2,ror(32-9) // TT2 ^ rotl(TT2,17) ^ rotl(TT2,9)
+ ins vdig1_bak.s[0],dig_E // new E
+.endm
+// sm3_mb_asimd_x1: SM3 compression for one job. In: x0 = job (data pointer in its first 8 bytes, 32-byte digest at offset 64), x1 = number of 64-byte blocks. The digest is updated in place.
+ .set wp_off , 96 // stack offset of the 12 pre-XORed words read by sm3_round_0
+ .set msg_off, 96 + 12*4 // stack offset of the message words (a 17-slot ring from round 12 on)
+#define STACK_SIZE 224
+ .global sm3_mb_asimd_x1
+ .type sm3_mb_asimd_x1, %function
+sm3_mb_asimd_x1:
+ stp x29,x30, [sp,-STACK_SIZE]!
+ cmp len,0 // flags are consumed by the ble below; nothing in between modifies them
+ ldr data,[job],64 // data = first 8 bytes of the job; x0 now points 64 bytes into the job
+ ldp qdig0,qdig1,[digest] // load the 32-byte digest through the advanced x0
+ stp x19, x20, [sp, 16]
+ stp x21, x22, [sp, 32]
+ rev32 vdig0.16b,vdig0.16b // byte-swap each 32-bit digest word
+ stp x23, x24, [sp, 48]
+ rev32 vdig1.16b,vdig1.16b
+ stp x25, x26, [sp, 64]
+ stp x27, x28, [sp, 80]
+ ble .exit_func // len <= 0: the exit path swaps the digest back and rewrites it unchanged
+// One iteration of the loop below consumes one 64-byte block.
+.start_loop:
+
+ /** prepare first 12 round data **/
+ ld1 {vvect_msg0.16b-vvect_msg3.16b},[data],64
+ mov Tj, 17689 // low half of 0x79cc4519 (17689 = 0x4519)
+ umov dig_A,vdig0.s[0]
+ movk Tj, 0x79cc, lsl 16 // Tj = 0x79cc4519
+ rev32 vvect_msg0.16b,vvect_msg0.16b // byte-swap each 32-bit message word
+ umov dig_B,vdig0.s[1]
+ rev32 vvect_msg1.16b,vvect_msg1.16b
+ umov dig_C,vdig0.s[2]
+ rev32 vvect_msg2.16b,vvect_msg2.16b
+ umov dig_D,vdig0.s[3]
+ rev32 vvect_msg3.16b,vvect_msg3.16b
+ umov dig_E,vdig1.s[0]
+ stp qvect_msg0,qvect_msg1,[sp,msg_off] // message words 0-7
+ umov dig_F,vdig1.s[1]
+ stp qvect_msg2,qvect_msg3,[sp,msg_off+32] // message words 8-15
+ umov dig_G,vdig1.s[2]
+ eor vvect_msgP0.16b,vvect_msg0.16b,vvect_msg1.16b // W[j] ^ W[j+4] for j = 0..3
+ eor vvect_msgP1.16b,vvect_msg1.16b,vvect_msg2.16b // W[j] ^ W[j+4] for j = 4..7
+ umov dig_H,vdig1.s[3]
+ stp qvect_msgP0,qvect_msgP1,[sp,wp_off]
+ eor vvect_msgP2.16b,vvect_msg2.16b,vvect_msg3.16b // W[j] ^ W[j+4] for j = 8..11
+ str qvect_msgP2,[sp,wp_off+32]
+
+ sm3_round_0 0
+ sm3_round_0 1
+ sm3_round_0 2
+ sm3_round_0 3
+ sm3_round_0 4
+ sm3_round_0 5
+ sm3_round_0 6
+ sm3_round_0 7
+ sm3_round_0 8
+ sm3_round_0 9
+ sm3_round_0 10
+ sm3_round_0 11
+
+ sm3_round_12 12
+ sm3_round_12 13
+ sm3_round_12 14
+ sm3_round_12 15
+ mov Tj, 0x7a87 // reload the round constant for rounds 16-63
+ movk Tj, 0x9d8a, lsl 16 // Tj = 0x9d8a7a87, which is 0x7a879d8a rotated left by 16
+ sm3_round_16 16
+ sm3_round_16 17
+ sm3_round_16 18
+ sm3_round_16 19
+ sm3_round_16 20
+ sm3_round_16 21
+ sm3_round_16 22
+ sm3_round_16 23
+ sm3_round_16 24
+ sm3_round_16 25
+ sm3_round_16 26
+ sm3_round_16 27
+ sm3_round_16 28
+ sm3_round_16 29
+ sm3_round_16 30
+ sm3_round_16 31
+ sm3_round_16 32
+ sm3_round_16 33
+ sm3_round_16 34
+ sm3_round_16 35
+ sm3_round_16 36
+ sm3_round_16 37
+ sm3_round_16 38
+ sm3_round_16 39
+ sm3_round_16 40
+ sm3_round_16 41
+ sm3_round_16 42
+ sm3_round_16 43
+ sm3_round_16 44
+ sm3_round_16 45
+ sm3_round_16 46
+ sm3_round_16 47
+ sm3_round_16 48
+ sm3_round_16 49
+ sm3_round_16 50
+ sm3_round_16 51
+ sm3_round_16 52
+ sm3_round_16 53
+ sm3_round_16 54
+ sm3_round_16 55
+ sm3_round_16 56
+ sm3_round_16 57
+ sm3_round_16 58
+ sm3_round_16 59
+ sm3_round_16 60
+ sm3_round_16 61
+ sm3_round_16 62
+ sm3_round_63 63
+ subs len,len,1 // flags are consumed by the bne below; the vector eor instructions do not modify them
+ eor vdig0.16b,vdig0.16b,vdig0_bak.16b // fold the round output into the running digest (words A-D)
+ eor vdig1.16b,vdig1.16b,vdig1_bak.16b // same for words E-H
+ bne .start_loop
+.exit_func:
+ ldp x19, x20, [sp, 16]
+ rev32 vdig0.16b,vdig0.16b // swap the digest words back to their in-memory byte order
+ ldp x21, x22, [sp, 32]
+ rev32 vdig1.16b,vdig1.16b
+ ldp x23, x24, [sp, 48]
+ stp qdig0,qdig1,[digest] // write the digest back to the job
+ ldp x25, x26, [sp, 64]
+ ldp x27, x28, [sp, 80]
+ ldp x29, x30, [sp], STACK_SIZE
+ ret
+ .size sm3_mb_asimd_x1, .-sm3_mb_asimd_x1
+
diff --git a/drv/hash_mb/sm3_mb_asimd_x4.S b/drv/hash_mb/sm3_mb_asimd_x4.S
new file mode 100644
index 0000000..975a07c
--- /dev/null
+++ b/drv/hash_mb/sm3_mb_asimd_x4.S
@@ -0,0 +1,576 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a // assemble for the Armv8.2-A base instruction set
+ .text
+ .align 2 // 2^2 = 4-byte alignment (the operand is a power-of-two exponent on AArch64)
+ .p2align 3,,7 // 8-byte alignment; the third operand caps the padding at 7 bytes
+// declare_var_vector_reg name,reg: create the aliases q+name, v+name and s+name for SIMD register number reg.
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+// Register aliases used by sm3_mb_asimd_x4 and its round macros.
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ len .req x4
+// jobN_data receive the data pointer loaded from the first 8 bytes of each job.
+ job0_data .req x5
+ job1_data .req x6
+ job2_data .req x7
+ job3_data .req x9
+// jobN_digest share x0-x3 with jobN: the function post-increments each job pointer by 64 before using it as a digest pointer.
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+ job3_digest .req x3
+ job0_tmp .req x10
+ job1_tmp .req x11
+ job2_tmp .req x12
+ job3_tmp .req x13
+ const_adr .req x14
+
+// msg0-msg16: 17 vector registers holding message words; each 32-bit lane belongs to one job.
+ declare_var_vector_reg msg0,0
+ declare_var_vector_reg msg1,1
+ declare_var_vector_reg msg2,2
+ declare_var_vector_reg msg3,3
+ declare_var_vector_reg msg4,4
+ declare_var_vector_reg msg5,5
+ declare_var_vector_reg msg6,6
+ declare_var_vector_reg msg7,7
+ declare_var_vector_reg msg8,8
+ declare_var_vector_reg msg9,9
+ declare_var_vector_reg msg10,10
+ declare_var_vector_reg msg11,11
+ declare_var_vector_reg msg12,12
+ declare_var_vector_reg msg13,13
+ declare_var_vector_reg msg14,14
+ declare_var_vector_reg msg15,15
+ declare_var_vector_reg msg16,16
+
+// dig_A-dig_H: the eight state words, one 32-bit lane per job.
+ declare_var_vector_reg dig_A,24
+ declare_var_vector_reg dig_B,25
+ declare_var_vector_reg dig_C,26
+ declare_var_vector_reg dig_D,27
+ declare_var_vector_reg dig_E,28
+ declare_var_vector_reg dig_F,29
+ declare_var_vector_reg dig_G,30
+ declare_var_vector_reg dig_H,31
+// Round temporaries and the per-lane round constant Tj.
+ declare_var_vector_reg TT1,17
+ declare_var_vector_reg TT2,18
+ declare_var_vector_reg SS1,19
+ declare_var_vector_reg SS2,20
+ declare_var_vector_reg tmp0,21
+ declare_var_vector_reg word_pair,23
+ declare_var_vector_reg Tj,22
+
+// rol32 target,reg,bit: write each 32-bit lane of vreg rotated left by bit into vtarget (target and reg must be different registers).
+.macro rol32 target:req,reg:req,bit:req
+ shl v\target\().4s,v\reg\().4s,\bit
+ sri v\target\().4s,v\reg\().4s,32 - \bit
+.endm
+// sm3_round_0 round,wp: one 4-lane SM3 round with the XOR boolean functions; byte-swaps both vmsg[round] and vmsg[wp] before use.
+// round 0-3 (sm3_mb_asimd_x4 uses sm3_round_4 for rounds 4-11)
+.macro sm3_round_0 round:req,wp:req
+
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ sli vtmp0.4s,vdig_A.4s,12 // tmp0 = rotl(A,12)
+ rev32 vmsg\round\().16b,vmsg\round\().16b
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,TT1,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1 // tmp0 = rotl(Tj,1)
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+// sm3_round_4 round,wp: same round as sm3_round_0 except that only vmsg[wp] is byte-swapped here.
+// round 4-11: vmsg[round] was already byte-swapped when it served as the wp operand four rounds earlier
+.macro sm3_round_4 round:req,wp:req
+
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ sli vtmp0.4s,vdig_A.4s,12 // tmp0 = rotl(A,12)
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,TT1,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1 // tmp0 = rotl(Tj,1)
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+// sm3_round_12 round,plus_4,m0..m4: first expands vmsg[plus_4] = P1(vmsg[m0] ^ vmsg[m1] ^ rotl(vmsg[m2],15)) ^ rotl(vmsg[m3],7) ^ vmsg[m4], then runs one XOR-function round on vmsg[round].
+//round 12-15
+.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12 // tmp0 = rotl(A,12)
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1 // tmp0 = rotl(Tj,1)
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+// sm3_round_16 round,plus_4,m0..m4: same message expansion as sm3_round_12; A/B/C go through a majority function and E/F/G through a bit select (bsl).
+// round 16-62
+.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12 // tmp0 = rotl(A,12)
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+ mov vTT2.16b,vdig_E.16b // E becomes the bsl selector
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b // TT2 = F where E is set, G elsewhere
+ and vTT1.16b,vTT1.16b,vdig_A.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ orr vTT1.16b,vTT1.16b,vtmp0.16b // TT1 = ((B|C) & A) | (B & C)
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1 // tmp0 = rotl(Tj,1)
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+// sm3_round_63: last round. Same arithmetic as sm3_round_16, but it reloads the saved state from the stack into vmsg0-vmsg7, XORs it into the new A-H and stores the result back at dig_off; Tj is not advanced.
+//round 63
+.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12 // tmp0 = rotl(A,12)
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+
+ ldp qmsg0,qmsg1,[sp,dig_off+ 0] // from here vmsg0-vmsg7 hold the saved state A-H, not message words
+ mov vTT2.16b,vdig_E.16b
+ ldp qmsg2,qmsg3,[sp,dig_off+ 32]
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
+ ldp qmsg4,qmsg5,[sp,dig_off+ 64]
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b // TT2 = F where E is set, G elsewhere
+ ldp qmsg6,qmsg7,[sp,dig_off+ 96]
+ and vTT1.16b,vTT1.16b,vdig_A.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ orr vTT1.16b,vTT1.16b,vtmp0.16b // TT1 = ((B|C) & A) | (B & C)
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ //D=C
+ eor vdig_D.16b,vdig_C.16b,vmsg3.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ eor vdig_C.16b,vdig_C.16b,vmsg2.16b
+ //B=A
+ eor vdig_B.16b,vdig_A.16b,vmsg1.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ //A=TT1
+ eor vdig_A.16b,vTT1.16b,vmsg0.16b
+ // H=G
+ eor vdig_H.16b,vdig_G.16b,vmsg7.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ eor vdig_G.16b,vdig_G.16b,vmsg6.16b
+ //F = E
+ eor vdig_F.16b,vdig_E.16b,vmsg5.16b
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+ eor vdig_E.16b, vdig_E.16b, vmsg4.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+.endm
+// sm3_mb_asimd_x4: SM3 compression for four jobs at once, one job per 32-bit NEON lane. In: x0-x3 = job pointers (data pointer in the first 8 bytes, 32-byte digest at offset 64), x4 = number of 64-byte blocks hashed per job. Digests are updated in place.
+ .set dig_off , 80 // stack offset of the 128-byte saved state (A-H, 16 bytes each)
+
+#define STACK_SIZE 224
+ .global sm3_mb_asimd_x4
+ .type sm3_mb_asimd_x4, %function
+sm3_mb_asimd_x4:
+ stp x29,x30, [sp,-STACK_SIZE]!
+ cmp len,0 // flags are consumed by the ble below; the loads and stores in between do not modify them
+ //push d8~d15
+ ldr job0_data, [job0],64 // data pointer from the job; x0 then addresses the digest at offset 64
+ stp d8,d9, [sp,16]
+ ldr job1_data, [job1],64
+ stp d10,d11,[sp,32]
+ ldr job2_data, [job2],64
+ stp d12,d13,[sp,48]
+ ldr job3_data, [job3],64
+ stp d14,d15,[sp,64]
+ ble .exit_func // len <= 0: nothing is hashed and the digests are left untouched
+// Temporary copies keep the digest pointers in x0-x3 intact for the final store.
+ mov job0_tmp,job0_digest
+ mov job1_tmp,job1_digest
+ mov job2_tmp,job2_digest
+ mov job3_tmp,job3_digest
+ //load digests
+ ld4 {vdig_A.s-vdig_D.s}[0],[job0_tmp],16 // lane N of A-D = words 0-3 of the digest of job N
+ ld4 {vdig_A.s-vdig_D.s}[1],[job1_tmp],16
+ ld4 {vdig_A.s-vdig_D.s}[2],[job2_tmp],16
+ adrp const_adr, .consts
+ ld4 {vdig_A.s-vdig_D.s}[3],[job3_tmp],16
+ add const_adr, const_adr, #:lo12:.consts // const_adr = address of the two Tj vectors after the function body
+ ld4 {vdig_E.s-vdig_H.s}[0],[job0_tmp]
+ rev32 vdig_A.16b,vdig_A.16b // byte-swap each 32-bit state word
+ ld4 {vdig_E.s-vdig_H.s}[1],[job1_tmp]
+ rev32 vdig_B.16b,vdig_B.16b
+ ld4 {vdig_E.s-vdig_H.s}[2],[job2_tmp]
+ rev32 vdig_C.16b,vdig_C.16b
+ ld4 {vdig_E.s-vdig_H.s}[3],[job3_tmp]
+ rev32 vdig_D.16b,vdig_D.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0] // keep a copy of the state; sm3_round_63 XORs it into the round output
+ rev32 vdig_E.16b,vdig_E.16b
+ rev32 vdig_F.16b,vdig_F.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ rev32 vdig_G.16b,vdig_G.16b
+ rev32 vdig_H.16b,vdig_H.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+// One iteration of the loop below consumes one 64-byte block from each of the four jobs.
+.start_loop:
+ ld4 {vmsg0.s-vmsg3.s}[0],[job0_data],16
+ ld4 {vmsg0.s-vmsg3.s}[1],[job1_data],16
+ ld4 {vmsg0.s-vmsg3.s}[2],[job2_data],16
+ ld4 {vmsg0.s-vmsg3.s}[3],[job3_data],16
+ ld4 {vmsg4.s-vmsg7.s}[0],[job0_data],16
+ ld4 {vmsg4.s-vmsg7.s}[1],[job1_data],16
+ ld4 {vmsg4.s-vmsg7.s}[2],[job2_data],16
+ ld4 {vmsg4.s-vmsg7.s}[3],[job3_data],16
+ ld4 {vmsg8.s-vmsg11.s}[0],[job0_data],16 // every register of the list uses the .s arrangement, like the sibling loads
+ ldr qTj,[const_adr] // Tj for rounds 0-15: 0x79cc4519 in every lane
+
+ sm3_round_0 0, 4
+
+ ld4 {vmsg8.s-vmsg11.s}[1],[job1_data],16
+ sm3_round_0 1, 5
+
+ ld4 {vmsg8.s-vmsg11.s}[2],[job2_data],16
+ sm3_round_0 2, 6
+ ld4 {vmsg8.s-vmsg11.s}[3],[job3_data],16
+ sm3_round_0 3, 7
+
+ ld4 {vmsg12.s-vmsg15.s}[0],[job0_data],16
+
+ sm3_round_4 4, 8
+ ld4 {vmsg12.s-vmsg15.s}[1],[job1_data],16
+ sm3_round_4 5, 9
+ ld4 {vmsg12.s-vmsg15.s}[2],[job2_data],16
+ sm3_round_4 6,10
+ ld4 {vmsg12.s-vmsg15.s}[3],[job3_data],16
+ sm3_round_4 7,11
+ sm3_round_4 8,12
+ sm3_round_4 9,13
+ sm3_round_4 10,14
+ sm3_round_4 11,15
+// From here on the 17 vmsg registers act as a ring: each round overwrites the oldest word with the newly expanded one.
+ sm3_round_12 12,16, 0, 7,13, 3,10 //12
+ sm3_round_12 13, 0, 1, 8,14, 4,11 //13
+ sm3_round_12 14, 1, 2, 9,15, 5,12 //14
+ sm3_round_12 15, 2, 3,10,16, 6,13 //15
+
+ ldr qTj,[const_adr,16] // Tj for rounds 16-63: 0x9d8a7a87 in every lane
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //16
+#if 0
+ stp sdig_A,sdig_B,[job0_digest]
+ stp sdig_C,sdig_D,[job0_digest,8]
+ stp sdig_E,sdig_F,[job0_digest,16]
+ stp sdig_G,sdig_H,[job0_digest,24]
+ b .exit_func
+#endif
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //17
+
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //18
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //19
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //20
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //21
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //22
+ sm3_round_16 6,10,11, 1, 7,14, 4 //23
+ sm3_round_16 7,11,12, 2, 8,15, 5 //24
+ sm3_round_16 8,12,13, 3, 9,16, 6 //25
+ sm3_round_16 9,13,14, 4,10, 0, 7 //26
+ sm3_round_16 10,14,15, 5,11, 1, 8 //27
+ sm3_round_16 11,15,16, 6,12, 2, 9 //28
+ sm3_round_16 12,16, 0, 7,13, 3,10 //29
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //30
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //31
+ sm3_round_16 15, 2, 3,10,16, 6,13 //32
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //33
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //34
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //35
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //36
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //37
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //38
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //39
+ sm3_round_16 6,10,11, 1, 7,14, 4 //40
+ sm3_round_16 7,11,12, 2, 8,15, 5 //41
+ sm3_round_16 8,12,13, 3, 9,16, 6 //42
+ sm3_round_16 9,13,14, 4,10, 0, 7 //43
+ sm3_round_16 10,14,15, 5,11, 1, 8 //44
+ sm3_round_16 11,15,16, 6,12, 2, 9 //45
+ sm3_round_16 12,16, 0, 7,13, 3,10 //46
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //47
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //48
+ sm3_round_16 15, 2, 3,10,16, 6,13 //49
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //50
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //51
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //52
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //53
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //54
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //55
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //56
+ sm3_round_16 6,10,11, 1, 7,14, 4 //57
+ sm3_round_16 7,11,12, 2, 8,15, 5 //58
+ sm3_round_16 8,12,13, 3, 9,16, 6 //59
+ sm3_round_16 9,13,14, 4,10, 0, 7 //60
+ sm3_round_16 10,14,15, 5,11, 1, 8 //61
+ sm3_round_16 11,15,16, 6,12, 2, 9 //62
+ sm3_round_63 12,16, 0, 7,13, 3,10 //63
+// sm3_round_63 has already folded the saved state into A-H and refreshed the copy on the stack.
+ subs len,len,1
+ bne .start_loop
+
+ //save digests with big endian
+ rev32 vdig_A.16b,vdig_A.16b
+ rev32 vdig_B.16b,vdig_B.16b
+ rev32 vdig_C.16b,vdig_C.16b
+ rev32 vdig_D.16b,vdig_D.16b
+ st4 {vdig_A.s-vdig_D.s}[0],[job0_digest],16 // lane N goes back to the digest of job N
+ rev32 vdig_E.16b,vdig_E.16b
+ rev32 vdig_F.16b,vdig_F.16b
+ st4 {vdig_A.s-vdig_D.s}[1],[job1_digest],16
+ rev32 vdig_G.16b,vdig_G.16b
+ rev32 vdig_H.16b,vdig_H.16b
+ st4 {vdig_A.s-vdig_D.s}[2],[job2_digest],16
+ st4 {vdig_A.s-vdig_D.s}[3],[job3_digest],16
+ st4 {vdig_E.s-vdig_H.s}[0],[job0_digest]
+ st4 {vdig_E.s-vdig_H.s}[1],[job1_digest]
+ st4 {vdig_E.s-vdig_H.s}[2],[job2_digest]
+ st4 {vdig_E.s-vdig_H.s}[3],[job3_digest]
+
+.exit_func:
+ ldp d8, d9, [sp,16]
+ ldp d10,d11,[sp,32]
+ ldp d12,d13,[sp,48]
+ ldp d14,d15,[sp,64]
+ ldp x29, x30, [sp], STACK_SIZE
+ ret
+.consts:
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .size sm3_mb_asimd_x4, .-sm3_mb_asimd_x4
+
diff --git a/drv/hash_mb/sm3_mb_sve.S b/drv/hash_mb/sm3_mb_sve.S
new file mode 100644
index 0000000..7dd2428
--- /dev/null
+++ b/drv/hash_mb/sm3_mb_sve.S
@@ -0,0 +1,161 @@
+/**********************************************************************
+ Copyright(c) 2022 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sve // enable the SVE instructions used in this file
+// copy_mb_16words vecs,dest: for each of `lanes` job pointers in the array vecs, copy the 64-byte block number block_ctr of that job to consecutive 64-byte slots at dest.
+.macro copy_mb_16words vecs:req,dest:req
+ mov src,\vecs
+ mov dst,\dest
+ mov ctr,lanes
+1:
+ ldr tmp,[src],8 // next job pointer
+ ldr tmp,[tmp] // first 8 bytes of the job: its data pointer
+ add tmp,tmp,block_ctr,lsl 6 // advance by block_ctr * 64 bytes
+ ld1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [tmp]
+ st1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [dst],64
+ subs ctr,ctr,1 // the body runs before this test, so lanes must be at least 1
+ b.ne 1b
+.endm
+// load_words windex: gather one 32-bit word per active lane into WORDwindex; call 0 first stages the current block of every lane in databuf.
+.macro load_words windex:req
+ .if \windex == 0
+ mov tmpw,16
+ index VOFFS.s,0,tmpw // VOFFS = 0,16,32,... ; scaled by 4 below this is a 64-byte stride per lane
+ copy_mb_16words job_vec,databuf
+ mov dataptr,databuf
+ .endif
+ ld1w { WORD\windex\().s}, p0/z, [dataptr, VOFFS.s, UXTW 2]
+ add dataptr,dataptr,4 // step to the next word for the following call
+.endm
+
+#include "sm3_sve_common.S"
+
+/* int sm3_mb_sve_max_lanes()
+ * return : number of 32-bit elements in one SVE vector (cntw), i.e. the lanes handled per segment
+ */
+ .global sm3_mb_sve_max_lanes
+ .type sm3_mb_sve_max_lanes, %function
+sm3_mb_sve_max_lanes:
+ cntw x0 // x0 = count of 32-bit elements per vector
+ ret
+ .size sm3_mb_sve_max_lanes, .-sm3_mb_sve_max_lanes
+/*
+ * void sm3_mb_sve(int blocks, int total_lanes, SM3_JOB **job_vec)
+ */
+ num_blocks .req w0
+ total_lanes .req w1
+ job_vec .req x2
+ lanes .req x4 // active lanes in the current segment (cntp of p0)
+ src .req x5
+ dst .req x6
+ lane_offset .req w7 // index of the first lane of the current segment
+ lane_offset_x .req x7 // 64-bit view of lane_offset
+ tmp .req x8
+ tmpw .req w8 // 32-bit view of tmp
+ block_ctr .req x9
+ block_ctr_w .req w9 // 32-bit view of block_ctr
+ savedsp .req x10
+ databuf .req x11
+ dataptr .req x12
+ efgh_buf .req x12 // shares x12 with dataptr; sm3_mb_sve recomputes it after the block loop
+ ctr .req x13
+ abcd_buf .req x14
+ sm3const_adr .req x15
+// sm3_mb_sve: hash num_blocks 64-byte blocks for each of total_lanes jobs, processing the jobs in segments of up to one SVE vector of 32-bit lanes. Each job holds its data pointer in the first 8 bytes and its 32-byte digest at offset 64.
+ .global sm3_mb_sve
+ .type sm3_mb_sve, %function
+sm3_mb_sve:
+ cbz num_blocks,.return // NOTE(review): total_lanes is not checked; with 0 lanes the .ldr_hash loop starts from ctr == 0 and wraps
+ sm3_sve_save_stack // macro not defined in this file; presumably from sm3_sve_common.S
+ mov savedsp,sp // sp is restored from savedsp before returning
+ mov lane_offset, #0
+ whilelo p0.s, wzr, total_lanes // p0 = active lanes of the first segment
+ // reserve (32 * max lanes) for abcdefgh buf
+ cntw tmp
+ lsl tmp, tmp, 5
+ sub abcd_buf,sp,tmp
+ mov tmp,63
+ bic abcd_buf,abcd_buf,tmp // round abcd_buf down to a 64-byte boundary
+ // reserve (64 * lanes) for data buf
+ cntp lanes,p0,p0.s
+ lsl tmp,lanes,6
+ sub databuf,abcd_buf,tmp
+ mov sp,databuf // move sp below both scratch buffers
+ adr sm3const_adr,SM3_CONSTS // SM3_CONSTS is not defined in this file; presumably from sm3_sve_common.S
+.seg_loops:
+ mov src,job_vec
+ mov dst,abcd_buf
+ cntp lanes,p0,p0.s // number of active lanes in this segment
+ add efgh_buf,abcd_buf,lanes,lsl 4 // E-H words are staged right after the lanes*16 bytes of A-D words
+ mov ctr,lanes
+.ldr_hash:
+ ldr tmp,[src],8 // next job pointer
+ add tmp,tmp,64 // digest lives 64 bytes into the job
+ ld1 {v0.16b, v1.16b},[tmp]
+ rev32 v0.16b,v0.16b // byte-swap each 32-bit digest word
+ rev32 v1.16b,v1.16b
+ st1 {v0.16b},[dst],16
+ st1 {v1.16b},[efgh_buf],16
+ subs ctr,ctr,1
+ bne .ldr_hash
+ ld4w {VA.s,VB.s,VC.s,VD.s},p0/z,[abcd_buf] // de-interleave: lane i of VA-VD = words A-D of job i
+ add tmp,abcd_buf,lanes,lsl 4
+ ld4w {VE.s,VF.s,VG.s,VH.s},p0/z,[tmp]
+ mov block_ctr,0
+ // always unpredicated SVE mode in current settings
+ pred_mode=0
+.block_loop:
+ sm3_single // macro not defined in this file; expected to process block number block_ctr of every lane
+ add block_ctr, block_ctr, 1
+ cmp block_ctr_w,num_blocks
+ bne .block_loop
+ st4w {VA.s,VB.s,VC.s,VD.s},p0,[abcd_buf] // re-interleave the state into per-job order
+ add efgh_buf,abcd_buf,lanes,lsl 4 // recompute: x12 is also dataptr, which load_words overwrites
+ st4w {VE.s,VF.s,VG.s,VH.s},p0,[efgh_buf]
+ mov dst,job_vec
+ mov src,abcd_buf
+ add job_vec,job_vec,lanes,lsl 3 // advance to the job pointers of the next segment
+ mov ctr,lanes
+.str_hash:
+ ld1 {v0.16b},[src],16
+ ld1 {v1.16b},[efgh_buf],16
+ rev32 v0.16b,v0.16b // swap back to the in-memory byte order
+ rev32 v1.16b,v1.16b
+ ldr tmp,[dst],8
+ add tmp,tmp,64
+ st1 {v0.16b,v1.16b},[tmp] // write the 32-byte digest back into the job
+ subs ctr,ctr,1
+ bne .str_hash
+ incw lane_offset_x // lane_offset += lanes per vector
+ whilelo p0.s, lane_offset, total_lanes
+ b.mi .seg_loops // whilelo sets N when the first lane of p0 is active, i.e. jobs remain
+ mov sp,savedsp
+ sm3_sve_restore_stack
+.return:
+ ret
+ .size sm3_mb_sve, .-sm3_mb_sve
diff --git a/drv/hash_mb/sm3_sve_common.S b/drv/hash_mb/sm3_sve_common.S
new file mode 100644
index 0000000..3d54952
--- /dev/null
+++ b/drv/hash_mb/sm3_sve_common.S
@@ -0,0 +1,505 @@
+/**********************************************************************
+ Copyright(c) 2022 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ VA .req z0 // VA-VH: the eight state vectors, one 32-bit state word per lane
+ VB .req z1
+ VC .req z2
+ VD .req z3
+ VE .req z4
+ VF .req z5
+ VG .req z6
+ VH .req z7
+ TMPV0 .req v8 // TMPV0-TMPV3 name v8-v11, the same register numbers as WORD0-WORD3 (z8-z11)
+ TMPV1 .req v9
+ TMPV2 .req v10
+ TMPV3 .req v11
+ WORD0 .req z8 // WORD0-WORD16: 17 message word slots, addressed modulo 17 by exec_step
+ WORD1 .req z9
+ WORD2 .req z10
+ WORD3 .req z11
+ WORD4 .req z12
+ WORD5 .req z13
+ WORD6 .req z14
+ WORD7 .req z15
+ WORD8 .req z16
+ WORD9 .req z17
+ WORD10 .req z18
+ WORD11 .req z19
+ WORD12 .req z20
+ WORD13 .req z21
+ WORD14 .req z22
+ WORD15 .req z23
+ WORD16 .req z24
+ VOFFS .req z24 // reuse WORD16
+ SS1 .req z25
+ SS2 .req z26
+ VT .req z26 // reuse SS2
+ TT2 .req z27
+ VT1 .req z28 // VT1-VT4: scratch vectors for the step macros
+ VT2 .req z29
+ VT3 .req z30
+ VT4 .req z31
+ VZERO .req z31 // same register as VT4, zeroed by sm3_single only when have_sve2 is 1
+ TT .req z0 // spare alias that SWAP_STATES rebinds while rotating the state names
+
+.macro sve_op inst:req,regd,args:vararg // emit inst with destination regd: p0/m merging form when pred_mode == 1, unpredicated form otherwise
+ .if pred_mode == 1
+ \inst \regd,p0/m,\args
+ .else
+ \inst \regd,\args
+ .endif
+.endm
+
+.macro sve_bitop inst:req,regd:req,regm:req // in-place bitwise op: regd = regd inst regm, operands are bare register names without a size suffix
+ .if pred_mode == 1
+ \inst \regd\().s,p0/m,\regd\().s,\regm\().s // predicated form, 32-bit elements under p0
+ .else
+ \inst \regd\().d,\regd\().d,\regm\().d // unpredicated form on the .d view, a bitwise result does not depend on element size
+ .endif
+.endm
+
+.macro rotate_left0 out:req,in:req,tmp:req,bits:req,args:vararg // phase 0 of a 32-bit left rotate, repeated for every (out,in,tmp,bits) group in the argument list
+ .if have_sve2 == 0
+ lsl \tmp\().s,\in\().s,\bits // tmp = in << bits, the high part of the result
+ .else
+ movprfx \out\().d,\in\().d
+ xar \out\().s,\out\().s,VZERO.s,32-\bits // (out ^ VZERO) rotated right by 32-bits, which is a left rotate by bits while VZERO holds zero
+ .endif
+
+ .ifnb \args
+ rotate_left0 \args
+ .endif
+.endm
+
+.macro rotate_left1 out:req,in:req,tmp:req,bits:req,args:vararg // phase 1, emits nothing on the SVE2 path
+ .if have_sve2 == 0
+ lsr \out\().s,\in\().s,32-\bits // out = in >> (32 - bits), safe when out and in are the same register because phase 0 already read in
+ .endif
+
+ .ifnb \args
+ rotate_left1 \args
+ .endif
+.endm
+
+.macro rotate_left2 out:req,in:req,tmp:req,bits:req,args:vararg // phase 2, emits nothing on the SVE2 path
+ .if have_sve2 == 0
+ orr \out\().d,\out\().d,\tmp\().d // merge the two shifted halves into the rotated value
+ .endif
+
+ .ifnb \args
+ rotate_left2 \args
+ .endif
+.endm
+
+.macro rotate_left args:vararg // out = in rotated left by bits for each group, phases are emitted group by group so independent rotates interleave
+ rotate_left0 \args
+ rotate_left1 \args
+ rotate_left2 \args
+.endm
+
+.macro SVE_EOR3 rd:req,r1:req,r2:req // in place: rd = rd ^ r1 ^ r2
+ .if have_sve2 == 0
+ sve_bitop eor,\rd,\r1
+ sve_bitop eor,\rd,\r2
+ .else
+ eor3 \rd\().d,\rd\().d,\r1\().d,\r2\().d // single three-way xor when SVE2 is selected
+ .endif
+.endm
+
+.macro FUNC_EOR3 ret:req,x:req,y:req,z:req // ret = x ^ y ^ z, the inputs are left unchanged
+ .if have_sve2 == 0
+ eor \ret\().d,\x\().d,\y\().d // ret = x ^ y
+ sve_bitop eor,\ret,\z
+ .else
+ movprfx \ret\().d,\x\().d // seed ret with x for the destructive eor3
+ eor3 \ret\().d,\ret\().d,\y\().d,\z\().d
+ .endif
+.endm
+
+.macro FUNC_FF windex:req,ret:req,x:req,y:req,z:req,tmp1:req,tmp2:req // majority: ret = (x & y) | (x & z) | (y & z), windex is accepted but not used in the body
+ and \ret\().d,\x\().d,\y\().d // ret = x & y
+ and \tmp1\().d,\x\().d,\z\().d // tmp1 = x & z
+ and \tmp2\().d,\y\().d,\z\().d // tmp2 = y & z
+ sve_bitop orr,\ret,\tmp1
+ sve_bitop orr,\ret,\tmp2
+.endm
+
+.macro FUNC_BSL ret:req,x:req,y:req,z:req,tmp:req // bitwise select with x as selector: ret = (x & y) | (~x & z), ret must be a different register from x and z
+ .if have_sve2 == 0
+ bic \ret\().d,\z\().d,\x\().d // ret = z & ~x
+ and \tmp\().d,\x\().d,\y\().d // tmp = x & y
+ sve_bitop orr,\ret,\tmp
+ .else
+ movprfx \ret\().d,\y\().d // SVE2 bsl takes its selector as the last operand, so ret is seeded with y rather than x
+ bsl \ret\().d,\ret\().d,\z\().d,\x\().d // ret = (y & x) | (z & ~x), the same value as the non-SVE2 branch
+ .endif
+.endm
+
+.altmacro
+.macro load_next_words windex // expands load_words for windex only while windex is below 16 (load_words is defined outside this chunk)
+ .if \windex < 16
+ load_words \windex
+ .endif
+.endm
+
+.macro SM3_STEP_00_11 windex:req,w:req,w4:req // round body used for steps 0-11: w is the message word of this step, w4 the word four slots ahead
+ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7)
+ ld1rw {VT2.s},p0/z,[sm3const_adr,\windex * 4] // broadcast 32-bit table entry number windex to every lane
+ rotate_left SS1,VA,VT1,12
+ mov SS2.s,p0/m,SS1.s // SS2 = rol32(a, 12), copied before SS1 is modified
+ sve_op add,SS1.s,SS1.s,VE.s
+ sve_op add,SS1.s,SS1.s,VT2.s
+ rotate_left SS1,SS1,VT2,7
+ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index]
+ add VT2.s,\w\().s,VH.s // VT2 = w + h
+ FUNC_EOR3 TT2,VE,VF,VG
+ // SS2 = SS1 ^ rol32(a, 12)
+ sve_bitop eor,SS2,SS1
+ sve_op add,TT2.s,TT2.s,VT2.s
+ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index]
+ FUNC_EOR3 VH,VA,VB,VC
+ eor VT1.d,\w\().d,\w4\().d // VT1 = w ^ w4, the WB term of TT1
+ sve_op add,VH.s,VH.s,VD.s
+ sve_op add,VH.s,VH.s,VT1.s
+ add VD.s,TT2.s,SS1.s // VD = complete TT2 sum
+ sve_op add,VH.s,VH.s,SS2.s
+ // d = P0(TT2)
+ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17
+ SVE_EOR3 VD,VT1,VT3
+ // b = rol32(b, 9)
+ // f = rol32(f, 19)
+ rotate_left VB,VB,VT3,9,VF,VF,VT4,19
+.endm
+
+.macro SM3_STEP_12_15 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req // round body used for steps 12-15: xor-based FF and GG as in steps 0-11, and w4 is computed from w16, w13, w9, w6 and w3
+ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7); the first rotate_left also starts the expansion: VT = rol32(w3, 15), w4 = rol32(w13, 7)
+ rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12
+ ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4] // broadcast 32-bit table entry number windex to every lane
+ mov TT2.s,p0/m,SS1.s // TT2 temporarily keeps rol32(a, 12) for the SS2 xor
+ sve_bitop eor,VT,\w16
+ sve_op add,SS1.s,SS1.s,VE.s
+ sve_bitop eor,VT,\w9
+ sve_op add,SS1.s,SS1.s,VT1.s
+ rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23
+ SVE_EOR3 VT,VT1,VT3
+ rotate_left SS1,SS1,VT2,7
+ sve_bitop eor,\w4,VT
+ // SS2 = SS1 ^ rol32(a, 12)
+ eor SS2.d,TT2.d,SS1.d
+ sve_bitop eor,\w4,\w6
+ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index]
+ FUNC_EOR3 TT2,VE,VF,VG
+ add VT1.s,\w\().s,VH.s // VT1 = w + h
+ sve_op add,TT2.s,TT2.s,VT1.s
+ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index]
+ FUNC_EOR3 VH,VA,VB,VC
+ eor VT1.d,\w\().d,\w4\().d // VT1 = w ^ w4, the WB term of TT1
+ sve_op add,VH.s,VH.s,VD.s
+ // b = rol32(b, 9)
+ // f = rol32(f, 19) is not done here, it is the last group of the P0 rotate_left below
+ rotate_left VB,VB,VT3,9
+ sve_op add,VH.s,VH.s,VT1.s
+ add VD.s,TT2.s,SS1.s // VD = complete TT2 sum, TT2 is free afterwards and serves as scratch below
+ sve_op add,VH.s,VH.s,SS2.s
+ // d = P0(TT2)
+ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17,VF,VF,TT2,19
+ SVE_EOR3 VD,VT1,VT3
+.endm
+
+.macro SM3_STEP_16_62 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req // round body used for steps 16-62: FF is the majority function, GG the bitwise select, and w4 is computed from w16, w13, w9, w6 and w3
+ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7); the first rotate_left also starts the expansion: VT = rol32(w3, 15), w4 = rol32(w13, 7)
+ rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12
+ ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4] // broadcast 32-bit table entry number windex to every lane
+ mov TT2.s,p0/m,SS1.s // TT2 temporarily keeps rol32(a, 12) for the SS2 xor
+ sve_bitop eor,VT,\w16
+ sve_op add,SS1.s,SS1.s,VE.s
+ sve_bitop eor,VT,\w9
+ sve_op add,SS1.s,SS1.s,VT1.s
+ rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23
+ SVE_EOR3 \w4,VT,VT1
+ rotate_left SS1,SS1,VT2,7
+ sve_bitop eor,\w4,VT3
+ // SS2 = SS1 ^ rol32(a, 12)
+ eor SS2.d,TT2.d,SS1.d
+ sve_bitop eor,\w4,\w6
+ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index]; w and h are accumulated into SS1 here, after SS2 has been taken
+ sve_op add,SS1.s,SS1.s,\w\().s
+ FUNC_BSL TT2,VE,VF,VG,VT1
+ sve_op add,SS1.s,SS1.s,VH.s
+ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index]
+ FUNC_FF \windex,VH,VA,VB,VC,VT1,VT2
+ eor VT1.d,\w\().d,\w4\().d // VT1 = w ^ w4, the WB term of TT1
+ sve_op add,VH.s,VH.s,VD.s
+ // b = rol32(b, 9)
+ // f = rol32(f, 19)
+ rotate_left VB,VB,VT2,9,VF,VF,VT4,19
+ sve_op add,VH.s,VH.s,VT1.s
+ add VD.s,TT2.s,SS1.s // VD = GG result + SS1 + w + h
+ sve_op add,VH.s,VH.s,SS2.s
+ // d = P0(TT2)
+ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17
+ SVE_EOR3 VD,VT1,VT3
+.endm
+
+.macro SM3_STEP_63 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req // final round body: same arithmetic as the other late rounds, then xors in the state that sm3_single saved in abcd_buf
+ // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7); the first rotate_left also starts the expansion: VT = rol32(w3, 15), w4 = rol32(w13, 7)
+ rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12
+ ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4] // broadcast 32-bit table entry number windex to every lane
+ mov TT2.s,p0/m,SS1.s // TT2 temporarily keeps rol32(a, 12) for the SS2 xor
+ sve_bitop eor,VT,\w16
+ sve_op add,SS1.s,SS1.s,VE.s
+ sve_bitop eor,VT,\w9
+ sve_op add,SS1.s,SS1.s,VT1.s
+ rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23
+ SVE_EOR3 VT,VT1,VT3
+ rotate_left SS1,SS1,VT2,7
+ sve_bitop eor,\w4,VT
+ // SS2 = SS1 ^ rol32(a, 12)
+ eor SS2.d,TT2.d,SS1.d
+ sve_bitop eor,\w4,\w6
+ // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index]
+ FUNC_BSL TT2,VE,VF,VG,VT1
+ add VT1.s,\w\().s,VH.s // VT1 = w + h
+ .if \windex == 63
+ ld1w {WORD0.s},p0/z,[abcd_buf, 0, MUL VL] // reload the eight state vectors stored at the top of sm3_single into WORD0-WORD7
+ ld1w {WORD1.s},p0/z,[abcd_buf, 1, MUL VL]
+ ld1w {WORD2.s},p0/z,[abcd_buf, 2, MUL VL]
+ ld1w {WORD3.s},p0/z,[abcd_buf, 3, MUL VL]
+ ld1w {WORD4.s},p0/z,[abcd_buf, 4, MUL VL]
+ ld1w {WORD5.s},p0/z,[abcd_buf, 5, MUL VL]
+ ld1w {WORD6.s},p0/z,[abcd_buf, 6, MUL VL]
+ ld1w {WORD7.s},p0/z,[abcd_buf, 7, MUL VL]
+ .endif
+ sve_op add,TT2.s,TT2.s,VT1.s
+ // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index]
+ FUNC_FF \windex,VH,VA,VB,VC,VT1,VT2
+ eor VT1.d,\w\().d,\w4\().d // VT1 = w ^ w4, the WB term of TT1
+ sve_op add,VH.s,VH.s,VD.s
+ // b = rol32(b, 9)
+ // f = rol32(f, 19)
+ rotate_left VB,VB,VT2,9,VF,VF,VT4,19
+ sve_op add,VH.s,VH.s,VT1.s
+ add VD.s,TT2.s,SS1.s // VD = complete TT2 sum
+ sve_bitop eor,VA,WORD1
+ sve_bitop eor,VB,WORD2
+ sve_bitop eor,VC,WORD3
+ // d = P0(TT2); each saved slot is xored into the register that takes that slot name in the SWAP_STATES following this step (VA becomes VB, so it pairs with slot 1)
+ rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17
+ sve_bitop eor,VF,WORD6
+ SVE_EOR3 VD,VT1,VT3
+ sve_bitop eor,VG,WORD7
+ sve_bitop eor,VD,WORD4
+ sve_op add,VH.s,VH.s,SS2.s
+ sve_bitop eor,VE,WORD5
+ sve_bitop eor,VH,WORD0
+.endm
+
+.macro SWAP_STATES // rotate the eight state names by one register without moving data: every name takes over the register of its predecessor
+ .unreq TT
+ TT .req VH // TT parks the register currently named VH
+ .unreq VH
+ VH .req VG
+ .unreq VG
+ VG .req VF
+ .unreq VF
+ VF .req VE
+ .unreq VE
+ VE .req VD
+ .unreq VD
+ VD .req VC
+ .unreq VC
+ VC .req VB
+ .unreq VB
+ VB .req VA
+ .unreq VA
+ VA .req TT // the old VH register becomes VA, eight swaps bring the names back to where they started
+.endm
+
+.altmacro
+.macro SM3_STEP_WRAPPER windex:req,idx:req,idx4:req,idx16,idx13,idx9,idx6,idx3 // pick the step macro for step windex and turn the numeric slot indices into WORDn register names
+ .if \windex <= 11
+ revb WORD\idx4\().s, p0/m, WORD\idx4\().s // reverse the bytes of each 32-bit element of the word four slots ahead
+ next=\idx4+1 // the following slot is fetched now, load_next_words skips it once the index reaches 16
+ load_next_words %next
+ SM3_STEP_00_11 \windex,WORD\idx\(),WORD\idx4\()
+ .else
+ .if \windex < 16
+ SM3_STEP_12_15 \windex,WORD\idx\(),\
+ WORD\idx4\(),WORD\idx16\(),WORD\idx13\(),\
+ WORD\idx9\(),WORD\idx6\(),WORD\idx3\()
+ .else
+ .if \windex == 63
+ SM3_STEP_63 \windex,WORD\idx\(),WORD\idx4\(),\
+ WORD\idx16\(),WORD\idx13\(),WORD\idx9\(),\
+ WORD\idx6\(),WORD\idx3\()
+ .else
+ SM3_STEP_16_62 \windex,WORD\idx\(),WORD\idx4\(),\
+ WORD\idx16\(),WORD\idx13\(),WORD\idx9\(),\
+ WORD\idx6\(),WORD\idx3\()
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro exec_step windex:req // emit step windex: compute the WORD slot numbers, expand the step, then rotate the state names
+ .if \windex <= 11
+ idx4=\windex+4 // slot of the word four steps ahead, at most 15 here so no wrap is needed
+ SM3_STEP_WRAPPER \windex,\windex,%idx4
+ .else
+ idxp4=\windex + 4 // index of the word that this step writes into slot idx4
+ idx4=idxp4 % 17 // slots wrap modulo 17, the number of WORD registers
+ idx16=(idxp4 - 16) % 17 // slots of the words 16, 13, 9, 6 and 3 positions before idxp4
+ idx13=(idxp4 - 13) % 17
+ idx9=(idxp4 - 9) % 17
+ idx6=(idxp4 - 6) % 17
+ idx3=(idxp4 - 3) % 17
+ idx=\windex % 17 // slot of the word consumed by this step
+ SM3_STEP_WRAPPER \windex,%idx,%idx4,%idx16,%idx13,%idx9,%idx6,%idx3
+ .endif
+ SWAP_STATES
+.endm
+
+.macro sm3_exec // fully unrolled expansion of steps 0 through 63
+ current_step=0
+ .rept 64
+ exec_step %current_step
+ current_step=current_step+1 // assembly-time counter, no instruction is emitted for it
+ .endr
+.endm
+
+.macro sm3_single sve2:vararg // one pass of the 64 unrolled steps over all lanes in p0, any non-blank argument selects the SVE2 instruction variants
+ .ifnb \sve2
+ have_sve2 = 1
+ .else
+ have_sve2=0
+ .endif
+ st1w {VA.s},p0,[abcd_buf, 0, MUL VL] // save the incoming state in eight vector-sized slots, SM3_STEP_63 reloads them for its final xor
+ st1w {VB.s},p0,[abcd_buf, 1, MUL VL]
+ st1w {VC.s},p0,[abcd_buf, 2, MUL VL]
+ st1w {VD.s},p0,[abcd_buf, 3, MUL VL]
+ st1w {VE.s},p0,[abcd_buf, 4, MUL VL]
+ st1w {VF.s},p0,[abcd_buf, 5, MUL VL]
+ st1w {VG.s},p0,[abcd_buf, 6, MUL VL]
+ st1w {VH.s},p0,[abcd_buf, 7, MUL VL] // next, words 0-4 are fetched through load_words, which is defined outside this chunk
+ load_words 0
+ load_words 1
+ load_words 2
+ load_words 3
+ load_words 4
+ revb WORD0.s, p0/m, WORD0.s // byte-reverse words 0-3 here, the step wrapper does the same for word 4 onward
+ revb WORD1.s, p0/m, WORD1.s
+ revb WORD2.s, p0/m, WORD2.s
+ revb WORD3.s, p0/m, WORD3.s
+ .if have_sve2 == 1
+ mov VZERO.s,p0/m,#0 // the xar-based rotate xors its input with VZERO, so it has to hold zero
+ .endif
+ sm3_exec
+.endm
+
+.macro sm3_sve_save_stack // push d8-d15 (callee-saved under AAPCS64) into a new 64-byte stack frame
+ stp d8,d9,[sp, -64]! // pre-decrement sp by 64 and store the first pair at the new sp
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+ stp d14,d15,[sp, 48]
+.endm
+
+.macro sm3_sve_restore_stack // reload d8-d15 from the frame written by sm3_sve_save_stack and release it
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d14,d15,[sp, 48]
+ ldp d8,d9,[sp],64 // last load post-increments sp by 64, undoing the pre-decrement of the save
+.endm
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .balign 16 // byte alignment of 16: on AArch64 gas the operand of .align is a power-of-two exponent, so the former .align 16 requested 64 KiB
+SM3_CONSTS: // 64 round constants read with ld1rw by the step macros, entry j is the base constant rotated left by j % 32
+ .word 0x79CC4519 // entries 0-15: base 0x79CC4519
+ .word 0xF3988A32
+ .word 0xE7311465
+ .word 0xCE6228CB
+ .word 0x9CC45197
+ .word 0x3988A32F
+ .word 0x7311465E
+ .word 0xE6228CBC
+ .word 0xCC451979
+ .word 0x988A32F3
+ .word 0x311465E7
+ .word 0x6228CBCE
+ .word 0xC451979C
+ .word 0x88A32F39
+ .word 0x11465E73
+ .word 0x228CBCE6
+ .word 0x9D8A7A87 // entries 16-63: base 0x7A879D8A
+ .word 0x3B14F50F
+ .word 0x7629EA1E
+ .word 0xEC53D43C
+ .word 0xD8A7A879
+ .word 0xB14F50F3
+ .word 0x629EA1E7
+ .word 0xC53D43CE
+ .word 0x8A7A879D
+ .word 0x14F50F3B
+ .word 0x29EA1E76
+ .word 0x53D43CEC
+ .word 0xA7A879D8
+ .word 0x4F50F3B1
+ .word 0x9EA1E762
+ .word 0x3D43CEC5
+ .word 0x7A879D8A
+ .word 0xF50F3B14
+ .word 0xEA1E7629
+ .word 0xD43CEC53
+ .word 0xA879D8A7
+ .word 0x50F3B14F
+ .word 0xA1E7629E
+ .word 0x43CEC53D
+ .word 0x879D8A7A
+ .word 0x0F3B14F5
+ .word 0x1E7629EA
+ .word 0x3CEC53D4
+ .word 0x79D8A7A8
+ .word 0xF3B14F50
+ .word 0xE7629EA1
+ .word 0xCEC53D43
+ .word 0x9D8A7A87
+ .word 0x3B14F50F
+ .word 0x7629EA1E
+ .word 0xEC53D43C
+ .word 0xD8A7A879
+ .word 0xB14F50F3
+ .word 0x629EA1E7
+ .word 0xC53D43CE
+ .word 0x8A7A879D
+ .word 0x14F50F3B
+ .word 0x29EA1E76
+ .word 0x53D43CEC
+ .word 0xA7A879D8
+ .word 0x4F50F3B1
+ .word 0x9EA1E762
+ .word 0x3D43CEC5
+
--
2.25.1
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。