From a3013c074cd2ab5f71eb98a587a627f38c68656c Mon Sep 17 00:00:00 2001
From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 17:07:24 +0800
Subject: [PATCH 12/18] Port maxmin patch to GCC 12
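
Backport the maxmin combine patterns to the GCC 12 branch.  The patch
adds define_insn_and_split patterns to aarch64-simd.md that let the
combine pass simplify the sign-extend/shift/negate chain and the
cmtst-like extend/truncate chain on V8HI vectors, plus a
vec_pack_trunc_shifted pattern that narrows two right-shifted vectors
with a single uzp2.  Two new predicates, aarch64_bic_imm_for_maxmin
and maxmin_arith_shift_operand, restrict the immediates these patterns
accept; in particular, maxmin_arith_shift_operand only matches a
constant-duplicate shift amount equal to the element size minus one.
A compile-only test derived from the x264 clip idiom checks that
smax/smin, cmtst and uzp1 are emitted.

The motivating idiom, copied from the new test, is the clip function
below; its (-x) >> 31 arithmetic shift is the "element size minus one"
shift amount that the new predicates recognize:

    static __attribute__((always_inline)) inline uint8_t clip (int x)
    {
      return ((x & ~((1 << 8) - 1)) ? (-x) >> 31 & ((1 << 8) - 1) : x);
    }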
---
gcc/config/aarch64/aarch64-simd.md | 256 ++++++++++++++++++++++++++
gcc/config/aarch64/predicates.md | 19 ++
gcc/testsuite/gcc.dg/combine-maxmin.c | 46 +++++
3 files changed, 321 insertions(+)
create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 82f73805f..de92802f5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1138,6 +1138,82 @@
[(set_attr "type" "neon_compare<q>,neon_shift_imm<q>")]
)
+;; Simplify a sign extension followed by a truncation for the shift+neg operation.
+
+(define_insn_and_split "*aarch64_sshr_neg_v8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 1)
+ (match_operand:V8HI 4 "vect_par_cnst_hi_half"))))
+ (match_dup 2)))))]
+ "TARGET_SIMD"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (ashiftrt:V8HI
+ (neg:V8HI
+ (match_operand:V8HI 1 "register_operand" "w"))
+ (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))]
+ {
+ /* Reduce the shift amount to the smaller mode. */
+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0))
+ - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2);
+ operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+ }
+ [(set_attr "type" "multiple")]
+)
+
+;; A helper definition that allows the combine pass to use the previous pattern.
+
+(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_operand:V4SI 1 "register_operand" "w"))
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+ (truncate:V4HI
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_operand:V4SI 3 "register_operand" "w"))
+ (match_dup 2)))))]
+ "TARGET_SIMD"
+ "#"
+ "&& true"
+ [(set (match_operand:V4SI 1 "register_operand" "=w")
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_dup 1))
+ (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
+ (set (match_operand:V4SI 3 "register_operand" "=w")
+ (ashiftrt:V4SI
+ (neg:V4SI
+ (match_dup 3))
+ (match_dup 2)))
+ (set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (match_dup 1))
+ (truncate:V4HI
+ (match_dup 3))))]
+ ""
+ [(set_attr "type" "multiple")]
+)
+
(define_insn "*aarch64_simd_sra<mode>"
[(set (match_operand:VDQ_I 0 "register_operand" "=w")
(plus:VDQ_I
@@ -1714,6 +1790,26 @@
}
)
+(define_insn "vec_pack_trunc_shifted_<mode>"
+ [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w")
+ (vec_concat:<VNARROWQ2>
+ (truncate:<VNARROWQ>
+ (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w")
+ (match_operand:VQN 2 "half_size_operand" "w")))
+ (truncate:<VNARROWQ>
+ (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w")
+ (match_operand:VQN 4 "half_size_operand" "w")))))]
+ "TARGET_SIMD"
+ {
+ if (BYTES_BIG_ENDIAN)
+ return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>";
+ else
+ return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>";
+ }
+ [(set_attr "type" "neon_permute<q>")
+ (set_attr "length" "4")]
+)
+
(define_insn "aarch64_shrn<mode>_insn_le"
[(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
(vec_concat:<VNARROWQ2>
@@ -6652,6 +6748,166 @@
[(set_attr "type" "neon_tst<q>")]
)
+;; Simplify a sign extension followed by a truncation for the cmtst-like operation.
+
+(define_insn_and_split "*aarch64_cmtst_arith_v8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (plus:V4HI
+ (truncate:V4HI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero")))
+ (match_operand:V4HI 5 "aarch64_simd_imm_minus_one"))
+ (plus:V4HI
+ (truncate:V4HI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_dup 1)
+ (match_dup 2))
+ (match_operand:V8HI 6 "vect_par_cnst_hi_half")))
+ (match_dup 4)))
+ (match_dup 5))))]
+ "TARGET_SIMD && !reload_completed"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 6 "register_operand" "=w")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (set (match_operand:V8HI 0 "register_operand" "=w")
+ (plus:V8HI
+ (eq:V8HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand" "w")
+ (match_dup 6))
+ (match_operand:V8HI 4 "aarch64_simd_imm_zero"))
+ (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))]
+ {
+ if (can_create_pseudo_p ())
+ {
+ int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0));
+ operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
+ int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0));
+ operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2);
+
+ operands[6] = gen_reg_rtx (V8HImode);
+ }
+ else
+ FAIL;
+ }
+ [(set_attr "type" "neon_tst_q")]
+)
+
+;; Three helper definitions that allow the combine pass to use the previous pattern.
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi"
+ [(set (match_operand:V4SI 0 "register_operand" "=w")
+ (neg:V4SI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ "TARGET_SIMD && !reload_completed"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 5 "register_operand" "=w")
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+ (set (match_operand:V4SI 0 "register_operand" "=w")
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 5)
+ (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
+ (set (match_dup 0)
+ (neg:V4SI
+ (eq:V4SI
+ (match_dup 0)
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ {
+ if (can_create_pseudo_p ())
+ operands[5] = gen_reg_rtx (V8HImode);
+ else
+ FAIL;
+ }
+ [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi"
+ [(set (match_operand:V4SI 0 "register_operand" "=w")
+ (neg:V4SI
+ (eq:V4SI
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
+ (match_operand:V8HI 3 "vect_par_cnst_hi_half")))
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ "TARGET_SIMD && !reload_completed"
+ "#"
+ "&& true"
+ [(set (match_operand:V8HI 5 "register_operand" "=w")
+ (and:V8HI
+ (match_operand:V8HI 1 "register_operand")
+ (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
+ (set (match_operand:V4SI 0 "register_operand" "=w")
+ (sign_extend:V4SI
+ (vec_select:V4HI
+ (match_dup 5)
+ (match_operand:V8HI 3 "vect_par_cnst_hi_half"))))
+ (set (match_dup 0)
+ (neg:V4SI
+ (eq:V4SI
+ (match_dup 0)
+ (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
+ {
+ if (can_create_pseudo_p ())
+ operands[5] = gen_reg_rtx (V8HImode);
+ else
+ FAIL;
+ }
+ [(set_attr "type" "multiple")]
+)
+
+(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi"
+ [(set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (not:V4SI
+ (match_operand:V4SI 1 "register_operand" "w")))
+ (truncate:V4HI
+ (not:V4SI
+ (match_operand:V4SI 2 "register_operand" "w")))))]
+ "TARGET_SIMD"
+ "#"
+ "&& true"
+ [(set (match_operand:V4SI 1 "register_operand" "=w")
+ (not:V4SI
+ (match_dup 1)))
+ (set (match_operand:V4SI 2 "register_operand" "=w")
+ (not:V4SI
+ (match_dup 2)))
+ (set (match_operand:V8HI 0 "register_operand" "=w")
+ (vec_concat:V8HI
+ (truncate:V4HI
+ (match_dup 1))
+ (truncate:V4HI
+ (match_dup 2))))]
+ ""
+ [(set_attr "type" "multiple")]
+)
+
(define_insn_and_split "aarch64_cmtstdi"
[(set (match_operand:DI 0 "register_operand" "=w,r")
(neg:DI
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 07c14aacb..1b8496c07 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -118,6 +118,25 @@
(match_test "aarch64_simd_valid_immediate (op, NULL,
AARCH64_CHECK_ORR)"))))
+(define_predicate "aarch64_bic_imm_for_maxmin"
+ (match_code "const_vector")
+{
+ if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC))
+ return false;
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode);
+ return CONST_INT_P (op)
+ && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1));
+})
+
+(define_predicate "maxmin_arith_shift_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1;
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
(define_predicate "aarch64_reg_or_bic_imm"
(ior (match_operand 0 "register_operand")
(and (match_code "const_vector")
diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
new file mode 100755
index 000000000..06bce7029
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
@@ -0,0 +1,46 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -fdump-rtl-combine-all" } */
+
+/* The test checks the use of smax/smin insns for clip evaluation and
+ * of uzp1/uzp2 insns for vector element narrowing. It is inspired by
+ * the sources of the x264 codec. */
+
+typedef unsigned char uint8_t;
+typedef long int intptr_t;
+typedef signed short int int16_t;
+
+static __attribute__((always_inline)) inline uint8_t clip (int x )
+{
+ return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x );
+}
+
+void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ intptr_t stride, int width, int height, int16_t *buf)
+{
+ const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
+ for( int y = 0; y < height; y++ ) {
+ for( int x = -2; x < width+3; x++ ) {
+ int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+ + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
+ dstv[x] = clip ( (v + 16) >> 5 );
+ buf[x+2] = v + pad;
+ }
+ for( int x = 0; x < width; x++ )
+ dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+ + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
+ - 32*pad + 512) >> 10);
+ for( int x = 0; x < width; x++ )
+ dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+ + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+ + 16) >> 5);
+ dsth += stride;
+ dstv += stride;
+ dstc += stride;
+ src += stride;
+ }
+}
+
+/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+/* { dg-final { scan-assembler-times {smin\t} 4 } } */
+/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
+/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
--
2.33.0