代码拉取完成,页面将自动刷新
From ce7c6c491ed0750a10f9a52b5edc710d978e70e2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 15 Jul 2024 16:19:17 +0800
Subject: [PATCH 07/10] x86: Disable non-temporal memset on Skylake Server
The original commit enabling non-temporal memset on Skylake Server had
erroneous benchmarks (actually done on ICX).
Further benchmarks indicate non-temporal stores may in fact be a
regression on Skylake Server.
This commit may be over-cautious in some cases, but should avoid any
regressions for 2.40.
Tested using qemu on all x86_64 cpu arch supported by both qemu +
GLIBC.
Reviewed-by: DJ Delorie <dj@redhat.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cpu-features.c | 13 +-
sysdeps/x86/cpu-tunables.c | 6 +
sysdeps/x86/dl-cacheinfo.h | 15 +-
...cpu-features-preferred_feature_index_1.def | 1 +
sysdeps/x86/tst-hwcap-tunables.c | 148 ++++++++++++++++++
5 files changed, 173 insertions(+), 10 deletions(-)
create mode 100644 sysdeps/x86/tst-hwcap-tunables.c
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c4dd85145e..b4030776a7 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -754,11 +754,18 @@ init_cpu_features (struct cpu_features *cpu_features)
/* Newer Bigcore microarch (larger non-temporal store
threshold). */
- case INTEL_BIGCORE_SKYLAKE:
- case INTEL_BIGCORE_KABYLAKE:
- case INTEL_BIGCORE_COMETLAKE:
case INTEL_BIGCORE_SKYLAKE_AVX512:
case INTEL_BIGCORE_CANNONLAKE:
+ /* Benchmarks indicate non-temporal memset is not
+ necessarily profitable on SKX (and in some cases much
+ worse). This is likely unique to SKX due to its unique
+ mesh interconnect (not present on ICX or BWD). Disable
+ non-temporal on all Skylake servers. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+ case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE:
+ case INTEL_BIGCORE_KABYLAKE:
case INTEL_BIGCORE_ICELAKE:
case INTEL_BIGCORE_TIGERLAKE:
case INTEL_BIGCORE_ROCKETLAKE:
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 0d4f328585..b8475730ea 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -272,6 +272,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
disable, 24);
}
break;
+ case 25:
+ {
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ Avoid_Non_Temporal_Memset,
+ disable, 25);
+ }
+ break; /* Without this, control falls through into case 26.  */
case 26:
{
CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index a76df092e6..de4584116f 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1051,13 +1051,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
/* Non-temporal stores are more performant on Intel and AMD hardware above
non_temporal_threshold. Enable this for both Intel and AMD hardware. */
unsigned long int memset_non_temporal_threshold = SIZE_MAX;
- if (cpu_features->basic.kind == arch_kind_intel
- || cpu_features->basic.kind == arch_kind_amd)
- memset_non_temporal_threshold = non_temporal_threshold;
-
- /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
- cases slower than the vectorized path (and for some alignments,
- it is really slow, check BZ #30994). */
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)
+ && (cpu_features->basic.kind == arch_kind_intel
+ || cpu_features->basic.kind == arch_kind_amd))
+ memset_non_temporal_threshold = non_temporal_threshold;
+
+ /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+ cases slower than the vectorized path (and for some alignments,
+ it is really slow, check BZ #30994). */
if (cpu_features->basic.kind == arch_kind_amd)
rep_movsb_threshold = non_temporal_threshold;
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index d20c5b3196..aae1c85551 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
+BIT (Avoid_Non_Temporal_Memset)
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
new file mode 100644
index 0000000000..94307283d7
--- /dev/null
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -0,0 +1,148 @@
+/* Tests for x86 GLIBC_TUNABLES=glibc.cpu.hwcaps filter.
+ Copyright (C) 2023-2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <array_length.h>
+#include <getopt.h>
+#include <ifunc-impl-list.h>
+#include <spawn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <intprops.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xunistd.h>
+#include <support/capture_subprocess.h>
+
+/* Nonzero if the program gets called via `exec'. */
+#define CMDLINE_OPTIONS \
+ { "restart", no_argument, &restart, 1 },
+static int restart;
+
+/* memcpy variants expected to be unusable when everything is disabled. */
+static const char *test_1[] =
+{
+ "__memcpy_avx512_no_vzeroupper",
+ "__memcpy_avx512_unaligned",
+ "__memcpy_avx512_unaligned_erms",
+ "__memcpy_evex_unaligned",
+ "__memcpy_evex_unaligned_erms",
+ "__memcpy_avx_unaligned",
+ "__memcpy_avx_unaligned_erms",
+ "__memcpy_avx_unaligned_rtm",
+ "__memcpy_avx_unaligned_erms_rtm",
+ "__memcpy_ssse3",
+};
+
+static const struct test_t
+{
+ const char *env; /* Value appended to "glibc.cpu.hwcaps=" for the child. */
+ const char *const *funcs; /* Implementation names that must be unusable. */
+ size_t nfuncs; /* Number of entries in FUNCS. */
+} tests[] =
+{
+ {
+ /* Disable everything. */
+ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
+ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset",
+ test_1,
+ array_length (test_1)
+ },
+ {
+ /* Same as before, but with some empty suboptions. */
+ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
+ "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
+ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,",
+ test_1,
+ array_length (test_1)
+ }
+};
+
+/* Called on process re-execution: check that every memcpy variant listed for test NTEST is unusable, then exit. */
+_Noreturn static void
+handle_restart (int ntest)
+{
+ struct libc_ifunc_impl impls[32];
+ int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
+ if (cnt == 0)
+ _exit (EXIT_SUCCESS); /* No implementations reported; nothing to check. */
+ TEST_VERIFY_EXIT (cnt >= 1);
+ for (int i = 0; i < cnt; i++)
+ {
+ for (int f = 0; f < tests[ntest].nfuncs; f++)
+ {
+ if (strcmp (impls[i].name, tests[ntest].funcs[f]) == 0)
+ TEST_COMPARE (impls[i].usable, false);
+ }
+ }
+
+ _exit (EXIT_SUCCESS);
+}
+
+static int
+do_test (int argc, char *argv[])
+{
+ /* We must have either:
+ - One or four parameters left if called initially:
+ + path to ld.so optional
+ + "--library-path" optional
+ + the library path optional
+ + the application name
+ + the test to check */
+
+ TEST_VERIFY_EXIT (argc == 2 || argc == 5);
+
+ if (restart)
+ handle_restart (atoi (argv[1]));
+
+ char nteststr[INT_BUFSIZE_BOUND (int)];
+
+ char *spargv[10];
+ {
+ int i = 0;
+ for (; i < argc - 1; i++)
+ spargv[i] = argv[i + 1]; /* Skip argv[0]. */
+ spargv[i++] = (char *) "--direct";
+ spargv[i++] = (char *) "--restart";
+ spargv[i++] = nteststr;
+ spargv[i] = NULL;
+ }
+
+ for (int i = 0; i < array_length (tests); i++)
+ {
+ snprintf (nteststr, sizeof nteststr, "%d", i);
+
+ printf ("[%d] Spawned test for %s\n", i, tests[i].env);
+ char *tunable = xasprintf ("glibc.cpu.hwcaps=%s", tests[i].env);
+ setenv ("GLIBC_TUNABLES", tunable, 1);
+
+ struct support_capture_subprocess result
+ = support_capture_subprogram (spargv[0], spargv, NULL);
+ support_capture_subprocess_check (&result, "tst-tunables", 0,
+ sc_allow_stderr);
+ support_capture_subprocess_free (&result);
+
+ free (tunable);
+ }
+
+ return 0;
+}
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
--
2.17.1
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。