代码拉取完成,页面将自动刷新
同步操作将从 yangshicheng/gcc 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001
From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
Date: Tue, 12 Mar 2024 23:30:56 +0800
Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during
vectorization. It takes the minimum of the iteration count and the segment
length, which helps to speed up loops with a small number of iterations when
only the tail can be vectorized.
---
gcc/params.opt | 5 ++
.../sve/var_stride_flexible_segment_len_1.c | 23 +++++++
gcc/tree-data-ref.cc | 67 +++++++++++++------
gcc/tree-data-ref.h | 11 ++-
gcc/tree-vect-data-refs.cc | 14 +++-
5 files changed, 95 insertions(+), 25 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
diff --git a/gcc/params.opt b/gcc/params.opt
index 6176d4790..7e5c119cf 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+-param=vect-alias-flexible-segment-len=
+Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
+Use a minimum length of different segments. Currently the minimum between
+the iteration count and the vectorization length is chosen by this param.
+
-param=vect-max-version-for-alignment-checks=
Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
new file mode 100644
index 000000000..894f075f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
+
+#define TYPE int
+#define SIZE 257
+
+void __attribute__ ((weak))
+f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
+{
+ for (int i = 0; i < SIZE; ++i)
+ x[i * n] += y[i * n];
+}
+
+/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
+/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
+/* Should use a WAR check that multiplies by (VF-2)*4 rather than
+ an overlap check that multiplies by (257-1)*4. */
+/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
+/* One range check and a check for n being zero. */
+/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 397792c35..e6ae9e847 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
same arguments. Try to optimize cases in which the second access
is a write and in which some overlap is valid. */
-static bool
-create_waw_or_war_checks (tree *cond_expr,
+static void
+create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
const dr_with_seg_len_pair_t &alias_pair)
{
const dr_with_seg_len& dr_a = alias_pair.first;
const dr_with_seg_len& dr_b = alias_pair.second;
- /* Check for cases in which:
-
- (a) DR_B is always a write;
- (b) the accesses are well-ordered in both the original and new code
- (see the comment above the DR_ALIAS_* flags for details); and
- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
- return false;
-
- /* Check for equal (but possibly variable) steps. */
tree step = DR_STEP (dr_a.dr);
- if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
- return false;
-
- /* Make sure that we can operate on sizetype without loss of precision. */
tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
- return false;
/* All addresses involved are known to have a common alignment ALIGN.
We can therefore subtract ALIGN from an exclusive endpoint to get
@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr,
fold_convert (ssizetype, indicator),
ssize_int (0));
- /* Get lengths in sizetype. */
- tree seg_len_a
- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
/* Each access has the following pattern:
@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr,
*cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
+}
+
+/* This is a wrapper function for create_waw_or_war_checks2. */
+static bool
+create_waw_or_war_checks (tree *cond_expr,
+ const dr_with_seg_len_pair_t &alias_pair)
+{
+ const dr_with_seg_len& dr_a = alias_pair.first;
+ const dr_with_seg_len& dr_b = alias_pair.second;
+
+ /* Check for cases in which:
+
+ (a) DR_B is always a write;
+ (b) the accesses are well-ordered in both the original and new code
+ (see the comment above the DR_ALIAS_* flags for details); and
+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
+ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+ return false;
+
+ /* Check for equal (but possibly variable) steps. */
+ tree step = DR_STEP (dr_a.dr);
+ if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+ return false;
+
+ /* Make sure that we can operate on sizetype without loss of precision. */
+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+ return false;
+
+ /* Get lengths in sizetype. */
+ tree seg_len_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len));
+ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
+ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
+ {
+ tree seg_len2_a
+ = fold_convert (sizetype,
+ rewrite_to_non_trapping_overflow (dr_a.seg_len2));
+ tree cond_expr2;
+ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
+ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
+ *cond_expr, cond_expr2);
+ }
return true;
}
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index f643a95b2..9bc5f16ee 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -213,12 +213,19 @@ class dr_with_seg_len
public:
dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
unsigned int a)
- : dr (d), seg_len (len), access_size (size), align (a) {}
-
+ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
+ {}
+ dr_with_seg_len (data_reference_p d, tree len, tree len2,
+ unsigned HOST_WIDE_INT size, unsigned int a)
+ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
+ {}
data_reference_p dr;
/* The offset of the last access that needs to be checked minus
the offset of the first. */
tree seg_len;
+ /* The second version of segment length. Currently this is used to
+ soften checks for a small number of iterations. */
+ tree seg_len2;
/* A value that, when added to abs (SEG_LEN), gives the total number of
bytes in the segment. */
poly_uint64 access_size;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 4e615b80b..04e68f621 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
poly_uint64 lower_bound;
tree segment_length_a, segment_length_b;
+ tree segment_length2_a, segment_length2_b;
unsigned HOST_WIDE_INT access_size_a, access_size_b;
unsigned int align_a, align_b;
@@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
segment_length_a = size_zero_node;
segment_length_b = size_zero_node;
+ segment_length2_a = size_zero_node;
+ segment_length2_b = size_zero_node;
}
else
{
@@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
length_factor = scalar_loop_iters;
else
length_factor = size_int (vect_factor);
+ /* In any case we should remember scalar_loop_iters;
+ this helps to create a flexible aliasing check
+ for a small number of iterations. */
segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
+ segment_length2_a
+ = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
+ segment_length2_b
+ = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
}
access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
@@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
}
dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
- access_size_a, align_a);
+ segment_length2_a, access_size_a, align_a);
dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
- access_size_b, align_b);
+ segment_length2_b, access_size_b, align_b);
/* Canonicalize the order to be the one that's needed for accurate
RAW, WAR and WAW flags, in cases where the data references are
well-ordered. The order doesn't really matter otherwise,
--
2.33.0
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。