代码拉取完成,页面将自动刷新
同步操作将从 src-openEuler/glibc 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
From 4ad2c9d04b76d7c4a42d80a82c022cd60b43b8b2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 24 May 2024 12:38:51 -0500
Subject: [PATCH 04/10] x86: Add separate non-temporal tunable for memset
The tuning for non-temporal stores for memset vs memcpy is not always
the same. This includes both the exact value and whether non-temporal
stores are profitable at all for a given arch.
This patch adds `x86_memset_non_temporal_threshold`. Currently we
disable non-temporal stores for non Intel vendors as the only
benchmarks showing its benefit have been on Intel hardware.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
manual/tunables.texi | 16 +++++++++++++++-
sysdeps/x86/cacheinfo.h | 8 +++++++-
sysdeps/x86/dl-cacheinfo.h | 16 ++++++++++++++++
sysdeps/x86/dl-diagnostics-cpu.c | 2 ++
sysdeps/x86/dl-tunables.list | 3 +++
sysdeps/x86/include/cpu-features.h | 4 +++-
.../x86_64/multiarch/memset-vec-unaligned-erms.S | 6 +++---
7 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/manual/tunables.texi b/manual/tunables.texi
index 6493904bae..2a2877884c 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647)
glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
+glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff)
glibc.cpu.x86_shstk:
glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
@@ -486,7 +487,8 @@ thread stack originally backup by Huge Pages to default pages.
@cindex shared_cache_size tunables
@cindex tunables, shared_cache_size
@cindex non_temporal_threshold tunables
-@cindex tunables, non_temporal_threshold
+@cindex memset_non_temporal_threshold tunables
+@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold
@deftp {Tunable namespace} glibc.cpu
Behavior of @theglibc{} can be tuned to assume specific hardware capabilities
@@ -562,6 +564,18 @@ like memmove and memcpy.
This tunable is specific to i386 and x86-64.
@end deftp
+@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold
+The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows
+the user to set the threshold in bytes for non temporal stores in
+memset. Non temporal stores give a hint to the hardware to move data
+directly to memory without displacing other data from the cache. This
+tunable is used by some platforms to determine when to use non
+temporal stores in memset.
+
+This tunable is specific to i386 and x86-64.
+@end deftp
+
+
@deftp Tunable glibc.cpu.x86_rep_movsb_threshold
The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
set threshold in bytes to start using "rep movsb". The value must be
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index ec1bc142c4..fd2b2ae66b 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024;
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
-/* Threshold to use non temporal store. */
+/* Threshold to use non temporal store in memmove. */
long int __x86_shared_non_temporal_threshold attribute_hidden;
+/* Threshold to use non temporal store in memset. */
+long int __x86_memset_non_temporal_threshold attribute_hidden;
+
/* Threshold to use Enhanced REP MOVSB. */
long int __x86_rep_movsb_threshold attribute_hidden = 2048;
@@ -77,6 +80,9 @@ init_cacheinfo (void)
__x86_shared_non_temporal_threshold
= cpu_features->non_temporal_threshold;
+ __x86_memset_non_temporal_threshold
+ = cpu_features->memset_non_temporal_threshold;
+
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
__x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
__x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index ce2e6927e4..9f27da21ce 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1048,6 +1048,13 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;
+ /* Non-temporal stores in memset have only been tested on Intel hardware.
+ Until we have benchmark data for other x86 processors, disable
+ non-temporal stores in memset. */
+ unsigned long int memset_non_temporal_threshold = SIZE_MAX;
+ if (cpu_features->basic.kind == arch_kind_intel)
+ memset_non_temporal_threshold = non_temporal_threshold;
+
/* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
cases slower than the vectorized path (and for some alignments,
it is really slow, check BZ #30994). */
@@ -1074,6 +1081,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
&& tunable_size <= maximum_non_temporal_threshold)
non_temporal_threshold = tunable_size;
+ tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL);
+ if (tunable_size > minimum_non_temporal_threshold
+ && tunable_size <= maximum_non_temporal_threshold)
+ memset_non_temporal_threshold = tunable_size;
+
tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
if (tunable_size > minimum_rep_movsb_threshold)
rep_movsb_threshold = tunable_size;
@@ -1089,6 +1101,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
minimum_non_temporal_threshold,
maximum_non_temporal_threshold);
+ TUNABLE_SET_WITH_BOUNDS (
+ x86_memset_non_temporal_threshold, memset_non_temporal_threshold,
+ minimum_non_temporal_threshold, maximum_non_temporal_threshold);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
@@ -1102,6 +1117,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
cpu_features->non_temporal_threshold = non_temporal_threshold;
+ cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold;
cpu_features->rep_movsb_threshold = rep_movsb_threshold;
cpu_features->rep_stosb_threshold = rep_stosb_threshold;
cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index 5aab63e532..05d54b5eba 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -83,6 +83,8 @@ _dl_diagnostics_cpu (void)
cpu_features->shared_cache_size);
print_cpu_features_value ("non_temporal_threshold",
cpu_features->non_temporal_threshold);
+ print_cpu_features_value ("memset_non_temporal_threshold",
+ cpu_features->memset_non_temporal_threshold);
print_cpu_features_value ("rep_movsb_threshold",
cpu_features->rep_movsb_threshold);
print_cpu_features_value ("rep_movsb_stop_threshold",
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index d1442d88ba..53852d6a07 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -30,6 +30,9 @@ glibc {
x86_non_temporal_threshold {
type: SIZE_T
}
+ x86_memset_non_temporal_threshold {
+ type: SIZE_T
+ }
x86_rep_movsb_threshold {
type: SIZE_T
# Since there is overhead to set up REP MOVSB operation, REP
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index 594feeb2f4..e2d641dcd0 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -918,8 +918,10 @@ struct cpu_features
/* Shared cache size for use in memory and string routines, typically
L2 or L3 size. */
unsigned long int shared_cache_size;
- /* Threshold to use non temporal store. */
+ /* Threshold to use non temporal store in memmove. */
unsigned long int non_temporal_threshold;
+ /* Threshold to use non temporal store in memset. */
+ unsigned long int memset_non_temporal_threshold;
/* Threshold to use "rep movsb". */
unsigned long int rep_movsb_threshold;
/* Threshold to stop using "rep movsb". */
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index aba45e3da0..d95750b516 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -24,9 +24,9 @@
5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
4 VEC stores and store 4 * VEC at a time until done.
6. On machines ERMS feature, if size is range
- [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold)
then REP STOSB will be used.
- 7. If size >= __x86_shared_non_temporal_threshold, use a
+ 7. If size >= __x86_memset_non_temporal_threshold, use
non-temporal stores. */
#include <sysdep.h>
@@ -318,7 +318,7 @@ L(return_vzeroupper):
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP
jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP
--
2.17.1
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。