1 Star 0 Fork 30

小羊/gzip

forked from src-openEuler/gzip 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
performance-neoncrc32-and-prfm.patch 5.38 KB
一键复制 编辑 原始数据 按行查看 历史
dogsheng 提交于 2019-11-19 14:09 . Package init
From 134712c35ed2ec5a06c61583dce59867aeb28862 Mon Sep 17 00:00:00 2001
From: liqiang64 <liqiang64@huawei.com>
Date: Mon, 11 Nov 2019 19:47:36 +0800
Subject: [PATCH] performance-neoncrc32-and-prfm
Profiling gzip with the perf tool showed that crc32 and
longest_match are the dominant hotspots.
On the ARM (AArch64) architecture, we can speed up crc32 by using
the hardware CRC32 intrinsics (__crc32b/h/w/d from <arm_acle.h>,
referred to here as "neon crc32"), and speed up the random-access
hash-chain walk in longest_match by issuing prefetch (PRFM)
instructions.
Modified by Li Qiang.
---
deflate.c | 27 ++++++++++++++++++++++++++-
util.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 75 insertions(+), 1 deletion(-)
diff --git a/deflate.c b/deflate.c
index 951d7af..f15a227 100644
--- a/deflate.c
+++ b/deflate.c
@@ -392,6 +392,9 @@ longest_match(IPos cur_match)
register int len; /* length of current match */
int best_len = prev_length; /* best match length so far */
IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL;
+ #ifdef __aarch64__
+ IPos next_match;
+ #endif
/* Stop when cur_match becomes <= limit. To simplify the code,
* we prevent matches with the string of window index 0.
*/
@@ -425,6 +428,10 @@ longest_match(IPos cur_match)
do {
Assert(cur_match < strstart, "no future");
match = window + cur_match;
+ #ifdef __aarch64__
+ next_match = prev[cur_match & WMASK];
+ __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK])));
+ #endif
/* Skip to next match if the match length cannot increase
* or if the match length is less than 2:
@@ -502,8 +509,14 @@ longest_match(IPos cur_match)
scan_end = scan[best_len];
#endif
}
- } while ((cur_match = prev[cur_match & WMASK]) > limit
+ }
+ #ifdef __aarch64__
+ while ((cur_match = next_match) > limit
&& --chain_length != 0);
+ #else
+ while ((cur_match = prev[cur_match & WMASK]) > limit
+ && --chain_length != 0);
+ #endif
return best_len;
}
@@ -788,7 +801,19 @@ off_t deflate()
lookahead -= prev_length-1;
prev_length -= 2;
RSYNC_ROLL(strstart, prev_length+1);
+ while (prev_length >= 4) {
+ prev_length -= 4;
+ strstart++;
+ INSERT_STRING(strstart, hash_head);
+ strstart++;
+ INSERT_STRING(strstart, hash_head);
+ strstart++;
+ INSERT_STRING(strstart, hash_head);
+ strstart++;
+ INSERT_STRING(strstart, hash_head);
+ }
do {
+ if (prev_length == 0) break;
strstart++;
INSERT_STRING(strstart, hash_head);
/* strstart never exceeds WSIZE-MAX_MATCH, so there are
diff --git a/util.c b/util.c
index bb5e9f3..d0b3cb0 100644
--- a/util.c
+++ b/util.c
@@ -31,6 +31,9 @@
#include "gzip.h"
#include <dirname.h>
#include <xalloc.h>
+#ifdef __aarch64__
+#include <arm_acle.h>
+#endif
#ifndef CHAR_BIT
# define CHAR_BIT 8
@@ -41,6 +44,7 @@ static int write_buffer (int, voidp, unsigned int);
/* ========================================================================
* Table of CRC-32's of all single-byte values (made by makecrc.c)
*/
+#ifndef __aarch64__
static const ulg crc_32_tab[] = {
0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
@@ -95,6 +99,7 @@ static const ulg crc_32_tab[] = {
0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
0x2d02ef8dL
};
+#endif
/* ===========================================================================
* Copy input to output unchanged: zcat == cat with --force.
@@ -129,6 +134,49 @@ ulg updcrc(s, n)
uch *s; /* pointer to bytes to pump through */
unsigned n; /* number of bytes in s[] */
{
+ #ifdef __aarch64__
+ register ulg c;
+ static ulg crc = (ulg)0xffffffffL;
+ register const uint8_t *buf1;
+ register const uint16_t *buf2;
+ register const uint32_t *buf4;
+ register const uint64_t *buf8;
+ int64_t length = (int64_t)n;
+ buf8 = (const uint64_t *)(const void *)s;
+
+ if (s == NULL) {
+ c = 0xffffffffL;
+ } else {
+ c = crc;
+
+ while(length >= sizeof(uint64_t)) {
+ c = __crc32d(c, *buf8++);
+ length -= sizeof(uint64_t);
+ }
+
+ buf4 = (const uint32_t *)(const void *)buf8;
+ if (length >= sizeof(uint32_t)) {
+ c = __crc32w(c, *buf4++);
+ length -= sizeof(uint32_t);
+ }
+
+ buf2 = (const uint16_t *)(const void *)buf4;
+ if(length >= sizeof(uint16_t)) {
+ c = __crc32h(c, *buf2++);
+ length -= sizeof(uint16_t);
+ }
+
+ buf1 = (const uint8_t *)(const void *)buf2;
+ if (length >= sizeof(uint8_t)) {
+ c = __crc32b(c, *buf1);
+ length -= sizeof(uint8_t);
+ }
+ }
+
+ crc = c;
+
+ return (c ^ 0xffffffffL);
+#else
register ulg c; /* temporary variable */
static ulg crc = (ulg)0xffffffffL; /* shift register contents */
@@ -143,6 +191,7 @@ ulg updcrc(s, n)
}
crc = c;
return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */
+#endif
}
/* ===========================================================================
--
1.8.3.1
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/caffeaulait/gzip.git
git@gitee.com:caffeaulait/gzip.git
caffeaulait
gzip
gzip
master

搜索帮助