From 6a6e07da8a12fd34965cdc06e6264c3e1cc00d1f Mon Sep 17 00:00:00 2001 From: baozhaoling Date: Fri, 29 Mar 2024 10:31:37 +0800 Subject: [PATCH] Add loop-slp-coop & slp-accumulation-combine for some unvectorizable cases. Add testsuite cases for loop-slp-coop-vectorize. Add shuffle fusion pass for vec-perm-expr combine. Add option -mloop-slp-coop: While enabled, try both use slp vectorize and loop vectorize while performing vectorize pass. Add option -mslp-optimize: While disabled, disable vect_optimize_slp & vect_gather_slp_loads function. Add option -mslp-accumulation-combine: The vect pass will perform rotate while performing vect pass. Enabled the option, and the addtion gimple will try to combine, reusing the register for register pressure Add option -mwidening_mul_ahead: Move the widening_mul_ahead pass, for optimization priority. Add option -mshuffle-fusion: To combine the permute and permute-like instructions. Add testsuite cases: gcc/testsuite/g++.dg/vect/simd-coop.cc Add testsuite cases: gcc/testsuite/g++.target/i386/simd-coop.C Add testsuite cases: gcc/testsuite/g++.target/i386/shuffle-fusion.C --- gcc/Makefile.in | 1 + gcc/common.opt | 20 + gcc/config/i386/i386-expand.cc | 62 ++- gcc/expr.cc | 25 +- gcc/gimple.h | 21 + gcc/optabs.cc | 4 +- gcc/passes.def | 8 +- gcc/shufflefusion.cc | 434 ++++++++++++++++++ gcc/testsuite/g++.dg/vect/simd-coop.cc | 33 ++ .../g++.target/i386/shuffle-fusion.C | 28 ++ gcc/testsuite/g++.target/i386/simd-coop.C | 33 ++ gcc/testsuite/gcc.dg/tree-ssa/pr69270.c | 12 +- gcc/testsuite/gcc.dg/tree-ssa/pr70232.c | 4 +- gcc/testsuite/gcc.dg/tree-ssa/pr71437.c | 4 +- gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c | 10 +- gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c | 10 +- gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c | 10 +- .../gcc.dg/tree-ssa/ssa-dom-thread-7.c | 4 +- gcc/timevar.def | 1 + gcc/tree-pass.h | 1 + gcc/tree-ssa-loop.cc | 2 + gcc/tree-ssa-math-opts.cc | 14 +- gcc/tree-ssa-ter.cc | 2 +- gcc/tree-vect-data-refs.cc | 8 +- gcc/tree-vect-loop.cc 
| 15 +- gcc/tree-vect-slp.cc | 18 +- gcc/tree-vect-stmts.cc | 102 ++++ gcc/tree-vectorizer.h | 3 + 28 files changed, 831 insertions(+), 58 deletions(-) create mode 100644 gcc/shufflefusion.cc create mode 100644 gcc/testsuite/g++.dg/vect/simd-coop.cc create mode 100644 gcc/testsuite/g++.target/i386/shuffle-fusion.C create mode 100644 gcc/testsuite/g++.target/i386/simd-coop.C diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 0aabc6ea3f2..c0696fef13f 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1581,6 +1581,7 @@ OBJS = \ selftest-run-tests.o \ sese.o \ shrink-wrap.o \ + shufflefusion.o \ simplify-rtx.o \ sparseset.o \ spellcheck.o \ diff --git a/gcc/common.opt b/gcc/common.opt index 30f979870f6..645d971a14c 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -3616,4 +3616,24 @@ fipa-ra Common Var(flag_ipa_ra) Optimization Use caller save register across calls if possible. +floop-slp-coop +Target Var(flag_loop_slp_coop) Init(0) +Perform loop-slp-cooperate vectorization. + +fslp-optimize +Target Var(flag_vectorize_slp_optimize) Init(1) +Enable slp optimize in vect pass. + +fslp-accumulation-combine +Target Var(flag_accumulation_combine) Init(0) +Combine the mul-add accumulation stmt for reg pressure. + +fwidening-mul-ahead +Target Var(flag_widening_mul_ahead) Init(0) +Move widening-mul pass ahead of the store-merging pass. + +fshuffle-fusion +Target Var(flag_shuffle_fusion) Init(0) +Combine the permute and permute-like gimple. + ; This comment is to ensure we retain the blank line above. diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 77dda5dd44e..9985c1a635a 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -19011,13 +19011,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) if (vmode == V8SImode) for (i = 0; i < 8; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); + rperm[i] = d->perm[i] != 255 ? 
+ GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7) : + GEN_INT(255); else if (vmode == V16SImode) for (i = 0; i < 16; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15); + rperm[i] = d->perm[i] != 255 ? + GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15) : + GEN_INT(255); else { - eltsz = GET_MODE_UNIT_SIZE (d->vmode); + eltsz = GET_MODE_SIZE (d->vmode) / nelt; if (!d->one_operand_p) mask = 2 * nelt - 1; else if (vmode == V64QImode) @@ -19027,11 +19031,15 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) else mask = nelt - 1; + /* Reserve the index to be 255 for the shuffle instructions to insert a 0 + in op0 */ for (i = 0; i < nelt; ++i) { unsigned j, e = d->perm[i] & mask; for (j = 0; j < eltsz; ++j) - rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); + if (d->perm[i] != 255) + rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); + else rperm[i * eltsz + j] = GEN_INT (255); } } @@ -19229,6 +19237,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) for (i = 0; i < nelt; i++) { nd.perm[i] = d->perm[i] & mask; + if (d->perm[i] == 255) nd.perm[i] = 255; if (nd.perm[i] != i) identity_perm = false; if (nd.perm[i]) @@ -19311,8 +19320,12 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) every other permutation operand. 
*/ for (i = 0; i < nelt; i += 2) { - nd.perm[i] = d->perm[i] & mask; - nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; + if (d->perm[i] != 255) + nd.perm[i] = d->perm[i] & mask; + else nd.perm[i] = 255; + if (d->perm[i + 1] != 255) + nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; + else nd.perm[i + 1] = 255; } if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, d->testing_p)) @@ -19323,10 +19336,18 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) { for (i = 0; i < nelt; i += 4) { - nd.perm[i + 0] = d->perm[i + 0] & mask; - nd.perm[i + 1] = d->perm[i + 1] & mask; - nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; - nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; + if (d->perm[i + 0] != 255) + nd.perm[i + 0] = d->perm[i + 0] & mask; + else nd.perm[i + 0] = 255; + if (d->perm[i + 1] != 255) + nd.perm[i + 1] = d->perm[i + 1] & mask; + else nd.perm[i + 1] = 255; + if (d->perm[i + 2] != 255) + nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; + else nd.perm[i + 2] = 255; + if (d->perm[i + 3] != 255) + nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; + else nd.perm[i + 3] = 255; } if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, @@ -20747,7 +20768,7 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) } nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); + eltsz = GET_MODE_SIZE (d->vmode) / nelt; /* Generate two permutation masks. If the required element is within the given vector it is shuffled into the proper lane. If the required @@ -20762,8 +20783,9 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) e -= nelt; for (j = 0; j < eltsz; ++j) - { - rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); + { + rperm[which][i*eltsz + j] = (d->perm[i] == 255) ? + GEN_INT(255) : GEN_INT (e*eltsz + j); rperm[1-which][i*eltsz + j] = m128; } @@ -21926,7 +21948,8 @@ canonicalize_perm (struct expand_vec_perm_d *d) int i, which, nelt = d->nelt; for (i = which = 0; i < nelt; ++i) - which |= (d->perm[i] < nelt ? 
1 : 2); + if (d->perm[i] != 255) + which |= (d->perm[i] < nelt ? 1 : 2); d->one_operand_p = true; switch (which) @@ -21948,7 +21971,8 @@ canonicalize_perm (struct expand_vec_perm_d *d) case 2: for (i = 0; i < nelt; ++i) - d->perm[i] &= nelt - 1; + if (d->perm[i] != 255) + d->perm[i] &= nelt - 1; d->op0 = d->op1; break; @@ -22098,10 +22122,11 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, for (i = which = 0; i < nelt; ++i) { unsigned char e = sel[i]; - gcc_assert (e < 2 * nelt); + gcc_assert (e < 2 * nelt || e == 255); d.perm[i] = e; perm[i] = e; - which |= (e < nelt ? 1 : 2); + if (e != 255) + which |= (e < nelt ? 1 : 2); } if (d.testing_p) @@ -22109,7 +22134,8 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, /* For all elements from second vector, fold the elements to first. */ if (which == 2) for (i = 0; i < nelt; ++i) - d.perm[i] -= nelt; + if (d.perm[i] != 255) + d.perm[i] -= nelt; /* Check whether the mask can be applied to the vector type. 
*/ d.one_operand_p = (which != 3); diff --git a/gcc/expr.cc b/gcc/expr.cc index e7804d52656..ff126ebf509 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -10148,17 +10148,36 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, { expand_operands (treeop0, treeop1, target, &op0, &op1, EXPAND_NORMAL); vec_perm_builder sel; + machine_mode modeop = TYPE_MODE (TREE_TYPE (treeop0)); + if (TREE_CODE (treeop2) == VECTOR_CST && tree_to_vec_perm_builder (&sel, treeop2)) { machine_mode sel_mode = TYPE_MODE (TREE_TYPE (treeop2)); - temp = expand_vec_perm_const (mode, op0, op1, sel, - sel_mode, target); + if (modeop != mode && flag_shuffle_fusion) + { + temp = expand_vec_perm_const (modeop, op0, op1, sel, + sel_mode, target); + rtx tempnew = temp; + temp = gen_reg_rtx (mode); + convert_move (temp, tempnew, unsignedp); + } + else + temp = expand_vec_perm_const (mode, op0, op1, sel, + sel_mode, target); } else { op2 = expand_normal (treeop2); - temp = expand_vec_perm_var (mode, op0, op1, op2, target); + if (modeop != mode && flag_shuffle_fusion) + { + temp = expand_vec_perm_var (modeop, op0, op1, op2, target); + rtx tempnew = temp; + temp = gen_reg_rtx (mode); + convert_move (temp, tempnew, unsignedp); + } + else + temp = expand_vec_perm_var (mode, op0, op1, op2, target); } gcc_assert (temp); return temp; diff --git a/gcc/gimple.h b/gcc/gimple.h index 77a5a07e9b5..df21897baa9 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -2953,6 +2953,27 @@ gimple_clobber_p (const gimple *s, enum clobber_kind kind) static inline bool is_gimple_call (const gimple *gs) { + if (gimple_code (gs) == GIMPLE_CALL && flag_accumulation_combine) + { + const gcall *call=dyn_cast (gs); + if (call -> u.internal_fn == IFN_FMA) + return false; + } + return gimple_code (gs) == GIMPLE_CALL; +} + +/* For i386 structure IFN_FMA is not a call instruction, but it is also a + gimple call. 
Returen true if GS is a no-fma GIMPLE_CALL */ + +static inline bool +is_nofma_gimple_call (const gimple *gs) +{ + if (gimple_code (gs) == GIMPLE_CALL) + { + const gcall *call=dyn_cast (gs); + if (call -> u.internal_fn == IFN_FMA) + return false; + } return gimple_code (gs) == GIMPLE_CALL; } diff --git a/gcc/optabs.cc b/gcc/optabs.cc index 3d8fa3abdfe..4ad8fc44ad0 100644 --- a/gcc/optabs.cc +++ b/gcc/optabs.cc @@ -6194,7 +6194,9 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1, cases in which the inputs are equal. Not all backends can cope with the single-input representation when testing for a double-input target instruction. */ - vec_perm_indices indices (sel, 2, GET_MODE_NUNITS (mode)); + poly_uint16 indice_nunits = flag_shuffle_fusion ? + 256 : GET_MODE_NUNITS(mode); + vec_perm_indices indices (sel, 2, indice_nunits); /* See if this can be handled with a vec_shr or vec_shl. We only do this if the second (for vec_shr) or first (for vec_shl) vector is all diff --git a/gcc/passes.def b/gcc/passes.def index 8dbb7983e3e..c52cfa3c5ab 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -297,6 +297,10 @@ along with GCC; see the file COPYING3. If not see POP_INSERT_PASSES () NEXT_PASS (pass_parallelize_loops, false /* oacc_kernels_p */); NEXT_PASS (pass_expand_omp_ssa); + NEXT_PASS (pass_tree_loop_done); + NEXT_PASS (pass_dominator, false); + NEXT_PASS (pass_copy_prop); + NEXT_PASS (pass_tree_loop_init); NEXT_PASS (pass_ch_vect); NEXT_PASS (pass_if_conversion); /* pass_vectorize must immediately follow pass_if_conversion. @@ -330,6 +334,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_lower_vector_ssa); NEXT_PASS (pass_lower_switch); NEXT_PASS (pass_cse_reciprocals); + NEXT_PASS (pass_optimize_widening_mul, false); NEXT_PASS (pass_reassoc, false /* early_p */); NEXT_PASS (pass_strength_reduction); NEXT_PASS (pass_split_paths); @@ -353,7 +358,8 @@ along with GCC; see the file COPYING3. 
If not see NEXT_PASS (pass_sink_code, true /* unsplit edges */); NEXT_PASS (pass_phiopt, false /* early_p */); NEXT_PASS (pass_fold_builtins); - NEXT_PASS (pass_optimize_widening_mul); + NEXT_PASS (pass_shuffle_fusion); + NEXT_PASS (pass_optimize_widening_mul, true); NEXT_PASS (pass_store_merging); NEXT_PASS (pass_tail_calls); /* If DCE is not run before checking for uninitialized uses, diff --git a/gcc/shufflefusion.cc b/gcc/shufflefusion.cc new file mode 100644 index 00000000000..f9a475fce5a --- /dev/null +++ b/gcc/shufflefusion.cc @@ -0,0 +1,434 @@ +/* Routines for performing Temporary Expression Replacement (TER) in SSA trees. + Copyright (C) 2003-2022 Free Software Foundation, Inc. + Contributed by Andrew MacLeod + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "tree.h" +#include "gimple.h" +#include "predict.h" +#include "tree-pass.h" +#include "ssa.h" +#include "cgraph.h" +#include "fold-const.h" +#include "stor-layout.h" +#include "gimple-iterator.h" +#include "gimple-walk.h" +#include "tree-ssa-loop-manip.h" +#include "tree-ssa-loop-niter.h" +#include "tree-cfg.h" +#include "cfgloop.h" +#include "tree-vectorizer.h" +#include "tree-ssa-propagate.h" +#include "dbgcnt.h" +#include "tree-scalar-evolution.h" +#include "stringpool.h" +#include "attribs.h" +#include "gimple-pretty-print.h" +#include "opt-problem.h" +#include "internal-fn.h" +#include "tree-ssa-sccvn.h" +#include "vec-perm-indices.h" + +// Operation1: +// If a VEC_PERM_EXPR has all uses as a permute-like expression. Change all +// the use into a new VEC_PERM_EXPR, and delete the old one. +// And VEC_PERM_EXPR can use 255 as a number in operand3 which means to put +// a 0 in the dest operand. As a result, we can turn VEC_UNPACK_LO/HI_EXPR +// into a VEC_PERM_EXPR. +// +// Case 1: +// vect__2 = VEC_PERM_EXPR ; +// vect__3 = [vec_unpack_lo_expr] vect__2; +// vect__4 = [vec_unpack_hi_expr] vect__2; +// ==> +// vect__3 = VEC_PERM_EXPR ; +// vect__4 = VEC_PERM_EXPR ; +// +// Case 2: +// vect__1 = VEC_PERM_EXPR ; +// vect__2 = VEC_PERM_EXPR ; +// vect__3 = VEC_PERM_EXPR ; +// ==> +// vect__2 = VEC_PERM_EXPR ; +// vect__3 = VEC_PERM_EXPR ; +// +// Operation2: +// While a VEC_PERM_EXPR has different src, and it's use only use one. It +// can be performed either. 
+// +// Case: +// vect__2 = VEC_PERM_EXPR ; +// vect__3 = [vec_unpack_float_lo_expr] vect__2; +// vect__4 = [vec_unpack_float_hi_expr] vect__2; +// ==> +// _0 = VEC_PERM_EXPR ; +// vect__3 = [vec_unpack_float_lo_expr] _0; +// _1 = VEC_PERM_EXPR ; +// vect__4 = [vec_unpack_float_lo_expr] _1; +void +permute_stmt_operation(gimple *stmt) +{ + imm_use_iterator iter; + gimple *use_stmt; + unsigned int perm[256],n; + + n=VECTOR_CST_NELTS (gimple_assign_rhs3(stmt)).to_constant (); + + for (unsigned int i=0; i=256) + { + /* vpshufb for ymm only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (unsigned int i = 0; i < n; ++i) + if ((permnew[i]!=255) || ((permnew[i] ^ i) & (n / 2))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Expr for ymm should match only one insn.\n" + ); + return; + } + } + + tree char_vectype = TREE_TYPE(gimple_assign_rhs3(stmt)); + tree op0 = gimple_assign_rhs1(stmt); + tree op1 = gimple_assign_rhs2(stmt); + unsigned int t=0; + for (unsigned int i=0; i32) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Shuffle fusion2 failed: expr has too many nelts.\n" + ); + return false; + } + + for (unsigned int i=0; ia = r->a + (*k) * p[u].a; + r->b = r->b + (*k) * p[u].b; + r->c = r->c + (*k) * p[u].c; + r->d = r->d + (*k) * p[u].d; + } +} diff --git a/gcc/testsuite/g++.target/i386/shuffle-fusion.C b/gcc/testsuite/g++.target/i386/shuffle-fusion.C new file mode 100644 index 00000000000..dd3a011bff1 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/shuffle-fusion.C @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=znver1 -O2" } */ +/* { dg-additional-options "-fshuffle-fusion -funsafe-math-optimizations" } */ +/* { dg-final { scan-assembler-not "vpsrl"} } */ + +typedef struct _Double +{ + double + a, + b; +} Double; + +typedef struct _Unsigned +{ + unsigned short + a, + b; +} Unsigned; + +void S(unsigned n, 
Double *r, + const double *__restrict k, const Unsigned *__restrict p) +{ + for (unsigned u = 0; u < n; u++, k--) + { + r->a = r->a + p[u].a; + r->b = r->b + p[u].b; + } +} diff --git a/gcc/testsuite/g++.target/i386/simd-coop.C b/gcc/testsuite/g++.target/i386/simd-coop.C new file mode 100644 index 00000000000..60e831be748 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/simd-coop.C @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=znver1 -O2 -fdump-tree-vect-details" } */ +/* { dg-additional-options "-floop-slp-coop -funsafe-math-optimizations" } */ +/* { dg-final { scan-tree-dump "Final SLP tree" "vect"} } */ + +typedef struct _Double +{ + double + a, + b, + c, + d; +} Double; + +typedef struct _Unsigned +{ + unsigned short + a, + b, + c, + d; +} Unsigned; + +void S(unsigned n, Double *r, + const double *__restrict k, const Unsigned *__restrict p) +{ + for (unsigned u = 0; u < n; u++, k--) + { + r->a = r->a + (*k) * p[u].a; + r->b = r->b + (*k) * p[u].b; + r->c = r->c + (*k) * p[u].c; + } +} diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr69270.c b/gcc/testsuite/gcc.dg/tree-ssa/pr69270.c index 0d66cc4383f..3aba15ee18b 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr69270.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr69270.c @@ -1,17 +1,17 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fsplit-paths -fdump-tree-dom3-details" } */ +/* { dg-options "-O2 -fsplit-paths -fdump-tree-dom4-details" } */ /* There should be two references to bufferstep that turn into constants. */ -/* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .0." 1 "dom3"} } */ -/* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .1." 1 "dom3"} } */ +/* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .0." 1 "dom4"} } */ +/* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .1." 1 "dom4"} } */ /* And some assignments ought to fold down to constants. 
*/ -/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 1;" 1 "dom3"} } */ -/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 0;" 1 "dom3"} } */ +/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 1;" 1 "dom4"} } */ +/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 0;" 1 "dom4"} } */ /* The XOR operations should have been optimized to constants. */ -/* { dg-final { scan-tree-dump-not "bit_xor" "dom3"} } */ +/* { dg-final { scan-tree-dump-not "bit_xor" "dom4"} } */ extern int *stepsizeTable; diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr70232.c b/gcc/testsuite/gcc.dg/tree-ssa/pr70232.c index d636672fddc..43809215a1b 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr70232.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr70232.c @@ -1,10 +1,10 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -w -fdump-tree-vrp1-details -fdump-tree-vrp2-details -fdump-tree-dom2-details -fdump-tree-dom3-details" } */ +/* { dg-options "-O2 -w -fdump-tree-vrp1-details -fdump-tree-vrp2-details -fdump-tree-dom2-details -fdump-tree-dom4-details" } */ /* All the threads found by the threader should have too many statements to be profitable. 
*/ /* { dg-final { scan-tree-dump-not "Registering jump " "dom2"} } */ -/* { dg-final { scan-tree-dump-not "Registering jump " "dom3"} } */ +/* { dg-final { scan-tree-dump-not "Registering jump " "dom4"} } */ /* { dg-final { scan-tree-dump-not "Registering jump " "vrp1"} } */ /* { dg-final { scan-tree-dump-not "Registering jump " "vrp2"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c b/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c index eab3a25928e..25bf07947d0 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-ffast-math -O3 -fdump-tree-dom3-details" } */ +/* { dg-options "-ffast-math -O3 -fdump-tree-dom4-details" } */ int I = 50, J = 50; int S, L; @@ -43,4 +43,4 @@ void foo (int K) /* We used to get 1 vrp-thread1 candidates here, but they now get deferred until after loop opts are done, because they were rotating loops. */ -/* { dg-final { scan-tree-dump-times "Threaded jump " 2 "dom3" } } */ +/* { dg-final { scan-tree-dump-times "Threaded jump " 2 "dom4" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c b/gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c index c8f8e612da2..979bfed98d8 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dom3" } */ +/* { dg-options "-O2 -fdump-tree-dom4" } */ struct x { @@ -16,8 +16,8 @@ f (struct x *p, unsigned int n) foo (p->a[n], p->c[n], p->b[n]); } -/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom3" { target { int32 } } } } */ -/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom3" { target { int16 } } } } */ -/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+;" 1 "dom3" } } */ +/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom4" { target { int32 } } } } */ +/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom4" { target { int16 } } } } */ +/* { dg-final { 
scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+;" 1 "dom4" } } */ /* - { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 3 "dom3" } } */ + { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 3 "dom4" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c b/gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c index b18e9c1fe21..2bcafe24000 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dom3" } */ +/* { dg-options "-O2 -fdump-tree-dom4" } */ struct x { @@ -20,7 +20,7 @@ f (struct x *p, unsigned int n) foo (p->b[n], p->a[n], p->c[n]); } -/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom3" { target { int32 } } } } */ -/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom3" { target { int16 } } } } */ -/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+" 1 "dom3" } } */ -/* { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 9 "dom3" } } */ +/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom4" { target { int32 } } } } */ +/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom4" { target { int16 } } } } */ +/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+" 1 "dom4" } } */ +/* { dg-final { scan-tree-dump-times "MEM *? 
*\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 9 "dom4" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c b/gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c index 00e8d2b52b3..4db297a2c82 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dom3" } */ +/* { dg-options "-O2 -fdump-tree-dom4" } */ struct x { @@ -22,7 +22,7 @@ f (struct x *p, unsigned int n) } } -/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom3" { target { int32 } } } } */ -/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom3" { target { int16 } } } } */ -/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+" 1 "dom3" } } */ -/* { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 9 "dom3" } } */ +/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom4" { target { int32 } } } } */ +/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom4" { target { int16 } } } } */ +/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+" 1 "dom4" } } */ +/* { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 9 "dom4" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c index b64e71dae22..a9cf2578e3c 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dom2-stats -fdump-tree-thread2-stats -fdump-tree-dom3-stats -fno-guess-branch-probability" } */ +/* { dg-options "-O2 -fdump-tree-dom2-stats -fdump-tree-thread2-stats -fdump-tree-dom4-stats -fno-guess-branch-probability" } */ /* { dg-final { scan-tree-dump-not "Jumps threaded" "dom2" } } */ @@ -10,7 +10,7 @@ /* aarch64 has the highest CASE_VALUES_THRESHOLD in GCC. 
It's high enough to change decisions in switch expansion which in turn can expose new jump threading opportunities. Skip the later tests on aarch64. */ -/* { dg-final { scan-tree-dump-not "Jumps threaded" "dom3" { target { ! aarch64*-*-* } } } } */ +/* { dg-final { scan-tree-dump-not "Jumps threaded" "dom4" { target { ! aarch64*-*-* } } } } */ /* { dg-final { scan-tree-dump "Jumps threaded: 7" "thread2" { target { ! aarch64*-*-* } } } } */ /* { dg-final { scan-tree-dump "Jumps threaded: 18" "thread2" { target { aarch64*-*-* } } } } */ diff --git a/gcc/timevar.def b/gcc/timevar.def index 794b8017d18..26a5b49d02e 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -309,6 +309,7 @@ DEFTIMEVAR (TV_INITIALIZE_RTL , "initialize rtl") DEFTIMEVAR (TV_GIMPLE_LADDRESS , "address lowering") DEFTIMEVAR (TV_TREE_LOOP_IFCVT , "tree loop if-conversion") DEFTIMEVAR (TV_WARN_ACCESS , "access analysis") +DEFTIMEVAR (TV_SHUFFLE_FUSION , "shuffle fusion") /* Everything else in rest_of_compilation not included above. 
*/ DEFTIMEVAR (TV_EARLY_LOCAL , "early local passes") diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 55ee2fe7f9e..cf86f15c501 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -499,6 +499,7 @@ extern gimple_opt_pass *make_pass_modref (gcc::context *ctxt); extern gimple_opt_pass *make_pass_coroutine_lower_builtins (gcc::context *ctxt); extern gimple_opt_pass *make_pass_coroutine_early_expand_ifns (gcc::context *ctxt); extern gimple_opt_pass *make_pass_adjust_alignment (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_shuffle_fusion (gcc::context *ctxt); /* IPA Passes */ extern simple_ipa_opt_pass *make_pass_ipa_lower_emutls (gcc::context *ctxt); diff --git a/gcc/tree-ssa-loop.cc b/gcc/tree-ssa-loop.cc index 73aa46627b4..844df411de4 100644 --- a/gcc/tree-ssa-loop.cc +++ b/gcc/tree-ssa-loop.cc @@ -339,6 +339,7 @@ public: pass_tree_loop_init (gcc::context *ctxt) : gimple_opt_pass (pass_data_tree_loop_init, ctxt) {} + opt_pass * clone () { return new pass_tree_loop_init (m_ctxt); } /* opt_pass methods: */ virtual unsigned int execute (function *); @@ -501,6 +502,7 @@ public: pass_tree_loop_done (gcc::context *ctxt) : gimple_opt_pass (pass_data_tree_loop_done, ctxt) {} + opt_pass * clone () { return new pass_tree_loop_done (m_ctxt); } /* opt_pass methods: */ virtual unsigned int execute (function *) { return tree_ssa_loop_done (); } diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc index 232e903b0d2..f89d3c3efcc 100644 --- a/gcc/tree-ssa-math-opts.cc +++ b/gcc/tree-ssa-math-opts.cc @@ -4891,13 +4891,24 @@ public: {} /* opt_pass methods: */ + opt_pass * clone () { return new pass_optimize_widening_mul (m_ctxt); } + void set_pass_param (unsigned int n, bool param) + { + gcc_assert (n == 0); + ahead = param; + } + virtual bool gate (function *) { - return flag_expensive_optimizations && optimize; + return flag_expensive_optimizations && optimize + && (ahead ^ flag_widening_mul_ahead); } virtual unsigned int execute (function *); + private: 
+ /* Determines whether the pass moved ahead. */ + bool ahead; }; // class pass_optimize_widening_mul /* Walker class to perform the transformation in reverse dominance order. */ @@ -5073,3 +5084,4 @@ make_pass_optimize_widening_mul (gcc::context *ctxt) { return new pass_optimize_widening_mul (ctxt); } + diff --git a/gcc/tree-ssa-ter.cc b/gcc/tree-ssa-ter.cc index 4cdad0d2749..d34e435c63a 100644 --- a/gcc/tree-ssa-ter.cc +++ b/gcc/tree-ssa-ter.cc @@ -685,7 +685,7 @@ find_replaceable_in_bb (temp_expr_table *tab, basic_block bb) /* Increment counter if this is a non BUILT_IN call. We allow replacement over BUILT_IN calls since many will expand to inline insns instead of a true call. */ - if (is_gimple_call (stmt) + if (is_nofma_gimple_call (stmt) && !((fndecl = gimple_call_fndecl (stmt)) && fndecl_built_in_p (fndecl))) cur_call_cnt++; diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index 4e615b80b3a..e4c8ffd5ff3 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -425,6 +425,8 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, if (apply_safelen ()) return opt_result::success (); + vect_depandence_issue = true; + if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt, "versioning for alias required: " @@ -4210,7 +4212,10 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p, vectype, memory_type, offtype, scale, &ifn, &offset_vectype)) - ifn = IFN_LAST; + { + ifn = IFN_LAST; + vect_depandence_issue = true; + } decl = NULL_TREE; } else @@ -4225,6 +4230,7 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, if (targetm.vectorize.builtin_scatter) decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); } + if (!decl) vect_depandence_issue = true; ifn = IFN_LAST; /* The offset vector type will be read from DECL when needed. 
*/ offset_vectype = NULL_TREE; diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 3435f9378da..da3434ae43c 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2374,11 +2374,14 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, /* Update the vectorization factor based on the SLP decision. */ vect_update_vf_for_slp (loop_vinfo); - /* Optimize the SLP graph with the vectorization factor fixed. */ - vect_optimize_slp (loop_vinfo); + if (flag_vectorize_slp_optimize) + { + /* Optimize the SLP graph with the vectorization factor fixed. */ + vect_optimize_slp (loop_vinfo); - /* Gather the loads reachable from the SLP graph entries. */ - vect_gather_slp_loads (loop_vinfo); + /* Gather the loads reachable from the SLP graph entries. */ + vect_gather_slp_loads (loop_vinfo); + } } bool saved_can_use_partial_vectors_p @@ -3016,6 +3019,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) /* Set cached VF to -1 prior to analysis, which indicates a mode has failed. */ cached_vf_per_mode[last_mode_i] = -1; + vect_depandence_issue = false; opt_loop_vec_info loop_vinfo = vect_analyze_loop_1 (loop, shared, &loop_form_info, NULL, vector_modes, mode_i, @@ -3128,6 +3132,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) "mode %s\n", GET_MODE_NAME (vector_modes[mode_i])); bool fatal; + vect_depandence_issue = true; opt_loop_vec_info loop_vinfo = vect_analyze_loop_1 (loop, shared, &loop_form_info, first_loop_vinfo, @@ -3193,6 +3198,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) return first_loop_vinfo; } +bool vect_depandence_issue; + /* Return true if there is an in-order reduction function for CODE, storing it in *REDUC_FN if so. 
*/ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index af477c31aa3..24343ebe597 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -924,6 +924,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, bool first_stmt_phi_p = false, phi_p = false; bool maybe_soft_fail = false; tree soft_fail_nunits_vectype = NULL_TREE; + bool arraystmt = false; /* For every stmt in NODE find its def stmt/s. */ stmt_vec_info stmt_info; @@ -932,6 +933,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, gimple *stmt = stmt_info->stmt; swap[i] = 0; matches[i] = false; + arraystmt = false; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt); @@ -1033,6 +1035,20 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, else { rhs_code = gimple_assign_rhs_code (stmt); + /* Loop-slp-cooperate vectorization. + For this case create a new stmt_info for the array, and perform as + slp vectorize. Set the size as 1, and loop vectorize will perform + the vectorized stmt as a new element. */ + if (rhs_code == MEM_REF && !vect_depandence_issue && + !zerop(stmt_info->dr_aux.dr->innermost.step)) + arraystmt = flag_loop_slp_coop; + if (arraystmt) + { + if(!stmt_info->first_element) + stmt_info->first_element = stmt_info; + if(!stmt_info->size) + stmt_info->size = 1; + } load_p = gimple_vuse (stmt); } @@ -1207,7 +1223,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, } /* Grouped store or load. 
*/ - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + if (STMT_VINFO_GROUPED_ACCESS (stmt_info) || arraystmt) { if (REFERENCE_CLASS_P (lhs)) { diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 34920041116..fe28725e5b1 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -6448,6 +6448,108 @@ vectorizable_operation (vec_info *vinfo, vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies, op0, &vec_oprnds0, op1, &vec_oprnds1, op2, &vec_oprnds2); + +/* Accumulation Combine + + For loop vectorize, reuse the register after rotation. + + Before rotate: + PHI: vx1 = {v0, vy1} + vy1 = vx1 + vz[i] + + After rotate: + PHI: vx1 = {v0, vy1} + PHI: vx2 = {0, vy2} + vy1 = vx1 + vz[i] + vy2 = vx2 + vz[i+1] + + After accumulation combine: + PHI: vx1 = {v0, vy2} + vy1 = vx1 + vz[i] + vy2 = vy1 + vz[i+1] +*/ + + bool combined = false; + + if(slp_node && !op2 && op1 && op0 + && ((TREE_CODE(op0) == SSA_NAME + && SSA_NAME_DEF_STMT(op0)->code == GIMPLE_PHI) + || (TREE_CODE(op1) == SSA_NAME + && SSA_NAME_DEF_STMT(op1)->code == GIMPLE_PHI))) + { + gimple* phi; + if (TREE_CODE(op0) == SSA_NAME + && SSA_NAME_DEF_STMT(op0)->code == GIMPLE_PHI) + phi = SSA_NAME_DEF_STMT(op0); + else + phi = SSA_NAME_DEF_STMT(op1); + for (i = 0; i < gimple_phi_num_args (phi); i++) + { + tree arg = gimple_phi_arg_def (phi, i); + if (arg == scalar_dest) + combined = flag_accumulation_combine; + } + if(SLP_TREE_LANES(slp_node) % + vect_nunits_for_cost(SLP_TREE_VECTYPE(slp_node)) == 0) + combined = false; + if (vec_num % SLP_TREE_LANES(slp_node) != 0) + combined = false; + } + if (combined) + { + tree addvec[vec_num]; + tree zero_val = build_zero_cst (TREE_TYPE (vectype_out)); + FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) + { + gimple *new_stmt = NULL; + vop1 = vec_oprnds1[i]; + if (icode == GIMPLE_PHI) + new_stmt + = gimple_build_assign (vec_dest, code, + addvec[i-SLP_TREE_LANES(slp_node)], + vop1, NULL_TREE); + else + new_stmt + = gimple_build_assign (vec_dest, code, + 
addvec[i-SLP_TREE_LANES(slp_node)], + vop0, NULL_TREE); + addvec[i] = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, addvec[i]); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + if(vec_num - i <= SLP_TREE_LANES(slp_node)) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); + } + + FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) + if (i >= SLP_TREE_LANES(slp_node)) + { + gimple *new_stmt = NULL; + if (SSA_NAME_DEF_STMT(op0)->code == GIMPLE_PHI) + new_stmt + = gimple_build_assign (vec_dest, PLUS_EXPR, + vop0, zero_val, NULL_TREE); + else + new_stmt + = gimple_build_assign (vec_dest, PLUS_EXPR, + vec_oprnds1[i], zero_val, NULL_TREE); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); + } + + vec_oprnds0.release (); + vec_oprnds1.release (); + vec_oprnds2.release (); + + return true; + } + /* Arguments are ready. Create the new vector stmt. */ FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) { diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 642eb0aeb21..dd99f98071e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2558,4 +2558,7 @@ vect_is_integer_truncation (stmt_vec_info stmt_info) && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); } +/* Flag to ensure the cases of loop-slp-cooperate vectorization. */ +extern bool vect_depandence_issue; + #endif /* GCC_TREE_VECTORIZER_H */ -- Gitee