From 6a6e07da8a12fd34965cdc06e6264c3e1cc00d1f Mon Sep 17 00:00:00 2001 From: baozhaoling Date: Fri, 29 Mar 2024 10:31:37 +0800 Subject: [PATCH] Add loop-slp-coop & slp-accumulation-combine for some unvectorizable cases. Add testsuite cases for loop-slp-coop-vectorize. Add shuffle fusion pass for vec-perm-expr combine. Add option -mloop-slp-coop: While enabled, try both use slp vectorize and loop vectorize while performing vectorize pass. Add option -mslp-optimize: While disabled, disable vect_optimize_slp & vect_gather_slp_loads function. Add option -mslp-accumulation-combine: The vect pass will perform rotate while performing vect pass. Enabled the option, and the addtion gimple will try to combine, reusing the register for register pressure Add option -mwidening_mul_ahead: Move the widening_mul_ahead pass, for optimization priority. Add option -mshuffle-fusion: To combine the permute and permute-like instructions. Add testsuite cases: gcc/testsuite/g++.dg/vect/simd-coop.cc Add testsuite cases: gcc/testsuite/g++.target/i386/simd-coop.C Add testsuite cases: gcc/testsuite/g++.target/i386/shuffle-fusion.C --- gcc/Makefile.in | 1 + gcc/common.opt | 20 + gcc/config/i386/i386-expand.cc | 62 ++- gcc/expr.cc | 25 +- gcc/gimple.h | 21 + gcc/optabs.cc | 4 +- gcc/passes.def | 8 +- gcc/shufflefusion.cc | 434 ++++++++++++++++++ gcc/testsuite/g++.dg/vect/simd-coop.cc | 33 ++ .../g++.target/i386/shuffle-fusion.C | 28 ++ gcc/testsuite/g++.target/i386/simd-coop.C | 33 ++ gcc/testsuite/gcc.dg/tree-ssa/pr69270.c | 12 +- gcc/testsuite/gcc.dg/tree-ssa/pr70232.c | 4 +- gcc/testsuite/gcc.dg/tree-ssa/pr71437.c | 4 +- gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c | 10 +- gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c | 10 +- gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c | 10 +- .../gcc.dg/tree-ssa/ssa-dom-thread-7.c | 4 +- gcc/timevar.def | 1 + gcc/tree-pass.h | 1 + gcc/tree-ssa-loop.cc | 2 + gcc/tree-ssa-math-opts.cc | 14 +- gcc/tree-ssa-ter.cc | 2 +- gcc/tree-vect-data-refs.cc | 8 +- gcc/tree-vect-loop.cc 
| 15 +- gcc/tree-vect-slp.cc | 18 +- gcc/tree-vect-stmts.cc | 102 ++++ gcc/tree-vectorizer.h | 3 + 28 files changed, 831 insertions(+), 58 deletions(-) create mode 100644 gcc/shufflefusion.cc create mode 100644 gcc/testsuite/g++.dg/vect/simd-coop.cc create mode 100644 gcc/testsuite/g++.target/i386/shuffle-fusion.C create mode 100644 gcc/testsuite/g++.target/i386/simd-coop.C diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 0aabc6ea3f2..c0696fef13f 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1581,6 +1581,7 @@ OBJS = \ selftest-run-tests.o \ sese.o \ shrink-wrap.o \ + shufflefusion.o \ simplify-rtx.o \ sparseset.o \ spellcheck.o \ diff --git a/gcc/common.opt b/gcc/common.opt index 30f979870f6..645d971a14c 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -3616,4 +3616,24 @@ fipa-ra Common Var(flag_ipa_ra) Optimization Use caller save register across calls if possible. +floop-slp-coop +Target Var(flag_loop_slp_coop) Init(0) +Perform loop-slp-cooperate vectorization. + +fslp-optimize +Target Var(flag_vectorize_slp_optimize) Init(1) +Enable slp optimize in vect pass. + +fslp-accumulation-combine +Target Var(flag_accumulation_combine) Init(0) +Combine the mul-add accumulation stmt for reg pressure. + +fwidening-mul-ahead +Target Var(flag_widening_mul_ahead) Init(0) +Move widening-mul pass ahead of the store-merging pass. + +fshuffle-fusion +Target Var(flag_shuffle_fusion) Init(0) +Combine the permute and permute-like gimple. + ; This comment is to ensure we retain the blank line above. diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 77dda5dd44e..9985c1a635a 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -19011,13 +19011,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) if (vmode == V8SImode) for (i = 0; i < 8; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); + rperm[i] = d->perm[i] != 255 ? 
+ GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7) : + GEN_INT(255); else if (vmode == V16SImode) for (i = 0; i < 16; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15); + rperm[i] = d->perm[i] != 255 ? + GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15) : + GEN_INT(255); else { - eltsz = GET_MODE_UNIT_SIZE (d->vmode); + eltsz = GET_MODE_SIZE (d->vmode) / nelt; if (!d->one_operand_p) mask = 2 * nelt - 1; else if (vmode == V64QImode) @@ -19027,11 +19031,15 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) else mask = nelt - 1; + /* Reserve the index to be 255 for the shuffle instructions to insert a 0 + in op0 */ for (i = 0; i < nelt; ++i) { unsigned j, e = d->perm[i] & mask; for (j = 0; j < eltsz; ++j) - rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); + if (d->perm[i] != 255) + rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); + else rperm[i * eltsz + j] = GEN_INT (255); } } @@ -19229,6 +19237,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) for (i = 0; i < nelt; i++) { nd.perm[i] = d->perm[i] & mask; + if (d->perm[i] == 255) nd.perm[i] = 255; if (nd.perm[i] != i) identity_perm = false; if (nd.perm[i]) @@ -19311,8 +19320,12 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) every other permutation operand. 
*/ for (i = 0; i < nelt; i += 2) { - nd.perm[i] = d->perm[i] & mask; - nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; + if (d->perm[i] != 255) + nd.perm[i] = d->perm[i] & mask; + else nd.perm[i] = 255; + if (d->perm[i + 1] != 255) + nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; + else nd.perm[i + 1] = 255; } if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, d->testing_p)) @@ -19323,10 +19336,18 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) { for (i = 0; i < nelt; i += 4) { - nd.perm[i + 0] = d->perm[i + 0] & mask; - nd.perm[i + 1] = d->perm[i + 1] & mask; - nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; - nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; + if (d->perm[i + 0] != 255) + nd.perm[i + 0] = d->perm[i + 0] & mask; + else nd.perm[i + 0] = 255; + if (d->perm[i + 1] != 255) + nd.perm[i + 1] = d->perm[i + 1] & mask; + else nd.perm[i + 1] = 255; + if (d->perm[i + 2] != 255) + nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; + else nd.perm[i + 2] = 255; + if (d->perm[i + 3] != 255) + nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; + else nd.perm[i + 3] = 255; } if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, @@ -20747,7 +20768,7 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) } nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); + eltsz = GET_MODE_SIZE (d->vmode) / nelt; /* Generate two permutation masks. If the required element is within the given vector it is shuffled into the proper lane. If the required @@ -20762,8 +20783,9 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) e -= nelt; for (j = 0; j < eltsz; ++j) - { - rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); + { + rperm[which][i*eltsz + j] = (d->perm[i] == 255) ? + GEN_INT(255) : GEN_INT (e*eltsz + j); rperm[1-which][i*eltsz + j] = m128; } @@ -21926,7 +21948,8 @@ canonicalize_perm (struct expand_vec_perm_d *d) int i, which, nelt = d->nelt; for (i = which = 0; i < nelt; ++i) - which |= (d->perm[i] < nelt ? 
1 : 2); + if (d->perm[i] != 255) + which |= (d->perm[i] < nelt ? 1 : 2); d->one_operand_p = true; switch (which) @@ -21948,7 +21971,8 @@ canonicalize_perm (struct expand_vec_perm_d *d) case 2: for (i = 0; i < nelt; ++i) - d->perm[i] &= nelt - 1; + if (d->perm[i] != 255) + d->perm[i] &= nelt - 1; d->op0 = d->op1; break; @@ -22098,10 +22122,11 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, for (i = which = 0; i < nelt; ++i) { unsigned char e = sel[i]; - gcc_assert (e < 2 * nelt); + gcc_assert (e < 2 * nelt || e == 255); d.perm[i] = e; perm[i] = e; - which |= (e < nelt ? 1 : 2); + if (e != 255) + which |= (e < nelt ? 1 : 2); } if (d.testing_p) @@ -22109,7 +22134,8 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, /* For all elements from second vector, fold the elements to first. */ if (which == 2) for (i = 0; i < nelt; ++i) - d.perm[i] -= nelt; + if (d.perm[i] != 255) + d.perm[i] -= nelt; /* Check whether the mask can be applied to the vector type. 
*/ d.one_operand_p = (which != 3); diff --git a/gcc/expr.cc b/gcc/expr.cc index e7804d52656..ff126ebf509 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -10148,17 +10148,36 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, { expand_operands (treeop0, treeop1, target, &op0, &op1, EXPAND_NORMAL); vec_perm_builder sel; + machine_mode modeop = TYPE_MODE (TREE_TYPE (treeop0)); + if (TREE_CODE (treeop2) == VECTOR_CST && tree_to_vec_perm_builder (&sel, treeop2)) { machine_mode sel_mode = TYPE_MODE (TREE_TYPE (treeop2)); - temp = expand_vec_perm_const (mode, op0, op1, sel, - sel_mode, target); + if (modeop != mode && flag_shuffle_fusion) + { + temp = expand_vec_perm_const (modeop, op0, op1, sel, + sel_mode, target); + rtx tempnew = temp; + temp = gen_reg_rtx (mode); + convert_move (temp, tempnew, unsignedp); + } + else + temp = expand_vec_perm_const (mode, op0, op1, sel, + sel_mode, target); } else { op2 = expand_normal (treeop2); - temp = expand_vec_perm_var (mode, op0, op1, op2, target); + if (modeop != mode && flag_shuffle_fusion) + { + temp = expand_vec_perm_var (modeop, op0, op1, op2, target); + rtx tempnew = temp; + temp = gen_reg_rtx (mode); + convert_move (temp, tempnew, unsignedp); + } + else + temp = expand_vec_perm_var (mode, op0, op1, op2, target); } gcc_assert (temp); return temp; diff --git a/gcc/gimple.h b/gcc/gimple.h index 77a5a07e9b5..df21897baa9 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -2953,6 +2953,27 @@ gimple_clobber_p (const gimple *s, enum clobber_kind kind) static inline bool is_gimple_call (const gimple *gs) { + if (gimple_code (gs) == GIMPLE_CALL && flag_accumulation_combine) + { + const gcall *call=dyn_cast (gs); + if (call -> u.internal_fn == IFN_FMA) + return false; + } + return gimple_code (gs) == GIMPLE_CALL; +} + +/* For i386 structure IFN_FMA is not a call instruction, but it is also a + gimple call. 
Returen true if GS is a no-fma GIMPLE_CALL */ + +static inline bool +is_nofma_gimple_call (const gimple *gs) +{ + if (gimple_code (gs) == GIMPLE_CALL) + { + const gcall *call=dyn_cast (gs); + if (call -> u.internal_fn == IFN_FMA) + return false; + } return gimple_code (gs) == GIMPLE_CALL; } diff --git a/gcc/optabs.cc b/gcc/optabs.cc index 3d8fa3abdfe..4ad8fc44ad0 100644 --- a/gcc/optabs.cc +++ b/gcc/optabs.cc @@ -6194,7 +6194,9 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1, cases in which the inputs are equal. Not all backends can cope with the single-input representation when testing for a double-input target instruction. */ - vec_perm_indices indices (sel, 2, GET_MODE_NUNITS (mode)); + poly_uint16 indice_nunits = flag_shuffle_fusion ? + 256 : GET_MODE_NUNITS(mode); + vec_perm_indices indices (sel, 2, indice_nunits); /* See if this can be handled with a vec_shr or vec_shl. We only do this if the second (for vec_shr) or first (for vec_shl) vector is all diff --git a/gcc/passes.def b/gcc/passes.def index 8dbb7983e3e..c52cfa3c5ab 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -297,6 +297,10 @@ along with GCC; see the file COPYING3. If not see POP_INSERT_PASSES () NEXT_PASS (pass_parallelize_loops, false /* oacc_kernels_p */); NEXT_PASS (pass_expand_omp_ssa); + NEXT_PASS (pass_tree_loop_done); + NEXT_PASS (pass_dominator, false); + NEXT_PASS (pass_copy_prop); + NEXT_PASS (pass_tree_loop_init); NEXT_PASS (pass_ch_vect); NEXT_PASS (pass_if_conversion); /* pass_vectorize must immediately follow pass_if_conversion. @@ -330,6 +334,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_lower_vector_ssa); NEXT_PASS (pass_lower_switch); NEXT_PASS (pass_cse_reciprocals); + NEXT_PASS (pass_optimize_widening_mul, false); NEXT_PASS (pass_reassoc, false /* early_p */); NEXT_PASS (pass_strength_reduction); NEXT_PASS (pass_split_paths); @@ -353,7 +358,8 @@ along with GCC; see the file COPYING3. 
If not see NEXT_PASS (pass_sink_code, true /* unsplit edges */); NEXT_PASS (pass_phiopt, false /* early_p */); NEXT_PASS (pass_fold_builtins); - NEXT_PASS (pass_optimize_widening_mul); + NEXT_PASS (pass_shuffle_fusion); + NEXT_PASS (pass_optimize_widening_mul, true); NEXT_PASS (pass_store_merging); NEXT_PASS (pass_tail_calls); /* If DCE is not run before checking for uninitialized uses, diff --git a/gcc/shufflefusion.cc b/gcc/shufflefusion.cc new file mode 100644 index 00000000000..f9a475fce5a --- /dev/null +++ b/gcc/shufflefusion.cc @@ -0,0 +1,434 @@ +/* Routines for performing Temporary Expression Replacement (TER) in SSA trees. + Copyright (C) 2003-2022 Free Software Foundation, Inc. + Contributed by Andrew MacLeod + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "tree.h" +#include "gimple.h" +#include "predict.h" +#include "tree-pass.h" +#include "ssa.h" +#include "cgraph.h" +#include "fold-const.h" +#include "stor-layout.h" +#include "gimple-iterator.h" +#include "gimple-walk.h" +#include "tree-ssa-loop-manip.h" +#include "tree-ssa-loop-niter.h" +#include "tree-cfg.h" +#include "cfgloop.h" +#include "tree-vectorizer.h" +#include "tree-ssa-propagate.h" +#include "dbgcnt.h" +#include "tree-scalar-evolution.h" +#include "stringpool.h" +#include "attribs.h" +#include "gimple-pretty-print.h" +#include "opt-problem.h" +#include "internal-fn.h" +#include "tree-ssa-sccvn.h" +#include "vec-perm-indices.h" + +// Operation1: +// If a VEC_PERM_EXPR has all uses as a permute-like expression. Change all +// the use into a new VEC_PERM_EXPR, and delete the old one. +// And VEC_PERM_EXPR can use 255 as a number in operand3 which means to put +// a 0 in the dest operand. As a result, we can turn VEC_UNPACK_LO/HI_EXPR +// into a VEC_PERM_EXPR. +// +// Case 1: +// vect__2 = VEC_PERM_EXPR ; +// vect__3 = [vec_unpack_lo_expr] vect__2; +// vect__4 = [vec_unpack_hi_expr] vect__2; +// ==> +// vect__3 = VEC_PERM_EXPR ; +// vect__4 = VEC_PERM_EXPR ; +// +// Case 2: +// vect__1 = VEC_PERM_EXPR ; +// vect__2 = VEC_PERM_EXPR ; +// vect__3 = VEC_PERM_EXPR ; +// ==> +// vect__2 = VEC_PERM_EXPR ; +// vect__3 = VEC_PERM_EXPR ; +// +// Operation2: +// While a VEC_PERM_EXPR has different src, and it's use only use one. It +// can be performed either. 
+// +// Case: +// vect__2 = VEC_PERM_EXPR ; +// vect__3 = [vec_unpack_float_lo_expr] vect__2; +// vect__4 = [vec_unpack_float_hi_expr] vect__2; +// ==> +// _0 = VEC_PERM_EXPR ; +// vect__3 = [vec_unpack_float_lo_expr] _0; +// _1 = VEC_PERM_EXPR ; +// vect__4 = [vec_unpack_float_lo_expr] _1; +void +permute_stmt_operation(gimple *stmt) +{ + imm_use_iterator iter; + gimple *use_stmt; + unsigned int perm[256],n; + + n=VECTOR_CST_NELTS (gimple_assign_rhs3(stmt)).to_constant (); + + for (unsigned int i=0; i=256) + { + /* vpshufb for ymm only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (unsigned int i = 0; i < n; ++i) + if ((permnew[i]!=255) || ((permnew[i] ^ i) & (n / 2))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Expr for ymm should match only one insn.\n" + ); + return; + } + } + + tree char_vectype = TREE_TYPE(gimple_assign_rhs3(stmt)); + tree op0 = gimple_assign_rhs1(stmt); + tree op1 = gimple_assign_rhs2(stmt); + unsigned int t=0; + for (unsigned int i=0; i32) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Shuffle fusion2 failed: expr has too many nelts.\n" + ); + return false; + } + + for (unsigned int i=0; ia = r->a + (*k) * p[u].a; + r->b = r->b + (*k) * p[u].b; + r->c = r->c + (*k) * p[u].c; + r->d = r->d + (*k) * p[u].d; + } +} diff --git a/gcc/testsuite/g++.target/i386/shuffle-fusion.C b/gcc/testsuite/g++.target/i386/shuffle-fusion.C new file mode 100644 index 00000000000..dd3a011bff1 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/shuffle-fusion.C @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=znver1 -O2" } */ +/* { dg-additional-options "-fshuffle-fusion -funsafe-math-optimizations" } */ +/* { dg-final { scan-assembler-not "vpsrl"} } */ + +typedef struct _Double +{ + double + a, + b; +} Double; + +typedef struct _Unsigned +{ + unsigned short + a, + b; +} Unsigned; + +void S(unsigned n, 
Double *r, + const double *__restrict k, const Unsigned *__restrict p) +{ + for (unsigned u = 0; u < n; u++, k--) + { + r->a = r->a + p[u].a; + r->b = r->b + p[u].b; + } +} diff --git a/gcc/testsuite/g++.target/i386/simd-coop.C b/gcc/testsuite/g++.target/i386/simd-coop.C new file mode 100644 index 00000000000..60e831be748 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/simd-coop.C @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=znver1 -O2 -fdump-tree-vect-details" } */ +/* { dg-additional-options "-floop-slp-coop -funsafe-math-optimizations" } */ +/* { dg-final { scan-tree-dump "Final SLP tree" "vect"} } */ + +typedef struct _Double +{ + double + a, + b, + c, + d; +} Double; + +typedef struct _Unsigned +{ + unsigned short + a, + b, + c, + d; +} Unsigned; + +void S(unsigned n, Double *r, + const double *__restrict k, const Unsigned *__restrict p) +{ + for (unsigned u = 0; u < n; u++, k--) + { + r->a = r->a + (*k) * p[u].a; + r->b = r->b + (*k) * p[u].b; + r->c = r->c + (*k) * p[u].c; + } +} diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr69270.c b/gcc/testsuite/gcc.dg/tree-ssa/pr69270.c index 0d66cc4383f..3aba15ee18b 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr69270.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr69270.c @@ -1,17 +1,17 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fsplit-paths -fdump-tree-dom3-details" } */ +/* { dg-options "-O2 -fsplit-paths -fdump-tree-dom4-details" } */ /* There should be two references to bufferstep that turn into constants. */ -/* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .0." 1 "dom3"} } */ -/* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .1." 1 "dom3"} } */ +/* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .0." 1 "dom4"} } */ +/* { dg-final { scan-tree-dump-times "Replaced .bufferstep_\[0-9\]+. with constant .1." 1 "dom4"} } */ /* And some assignments ought to fold down to constants. 
*/ -/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 1;" 1 "dom3"} } */ -/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 0;" 1 "dom3"} } */ +/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 1;" 1 "dom4"} } */ +/* { dg-final { scan-tree-dump-times "Folded to: _\[0-9\]+ = 0;" 1 "dom4"} } */ /* The XOR operations should have been optimized to constants. */ -/* { dg-final { scan-tree-dump-not "bit_xor" "dom3"} } */ +/* { dg-final { scan-tree-dump-not "bit_xor" "dom4"} } */ extern int *stepsizeTable; diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr70232.c b/gcc/testsuite/gcc.dg/tree-ssa/pr70232.c index d636672fddc..43809215a1b 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr70232.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr70232.c @@ -1,10 +1,10 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -w -fdump-tree-vrp1-details -fdump-tree-vrp2-details -fdump-tree-dom2-details -fdump-tree-dom3-details" } */ +/* { dg-options "-O2 -w -fdump-tree-vrp1-details -fdump-tree-vrp2-details -fdump-tree-dom2-details -fdump-tree-dom4-details" } */ /* All the threads found by the threader should have too many statements to be profitable. 
*/ /* { dg-final { scan-tree-dump-not "Registering jump " "dom2"} } */ -/* { dg-final { scan-tree-dump-not "Registering jump " "dom3"} } */ +/* { dg-final { scan-tree-dump-not "Registering jump " "dom4"} } */ /* { dg-final { scan-tree-dump-not "Registering jump " "vrp1"} } */ /* { dg-final { scan-tree-dump-not "Registering jump " "vrp2"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c b/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c index eab3a25928e..25bf07947d0 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr71437.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-ffast-math -O3 -fdump-tree-dom3-details" } */ +/* { dg-options "-ffast-math -O3 -fdump-tree-dom4-details" } */ int I = 50, J = 50; int S, L; @@ -43,4 +43,4 @@ void foo (int K) /* We used to get 1 vrp-thread1 candidates here, but they now get deferred until after loop opts are done, because they were rotating loops. */ -/* { dg-final { scan-tree-dump-times "Threaded jump " 2 "dom3" } } */ +/* { dg-final { scan-tree-dump-times "Threaded jump " 2 "dom4" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c b/gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c index c8f8e612da2..979bfed98d8 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/slsr-27.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dom3" } */ +/* { dg-options "-O2 -fdump-tree-dom4" } */ struct x { @@ -16,8 +16,8 @@ f (struct x *p, unsigned int n) foo (p->a[n], p->c[n], p->b[n]); } -/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom3" { target { int32 } } } } */ -/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom3" { target { int16 } } } } */ -/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+;" 1 "dom3" } } */ +/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom4" { target { int32 } } } } */ +/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom4" { target { int16 } } } } */ +/* { dg-final { 
scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+;" 1 "dom4" } } */ /* - { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 3 "dom3" } } */ + { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 3 "dom4" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c b/gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c index b18e9c1fe21..2bcafe24000 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/slsr-28.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dom3" } */ +/* { dg-options "-O2 -fdump-tree-dom4" } */ struct x { @@ -20,7 +20,7 @@ f (struct x *p, unsigned int n) foo (p->b[n], p->a[n], p->c[n]); } -/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom3" { target { int32 } } } } */ -/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom3" { target { int16 } } } } */ -/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+" 1 "dom3" } } */ -/* { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 9 "dom3" } } */ +/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom4" { target { int32 } } } } */ +/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom4" { target { int16 } } } } */ +/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+" 1 "dom4" } } */ +/* { dg-final { scan-tree-dump-times "MEM *? 
*\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 9 "dom4" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c b/gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c index 00e8d2b52b3..4db297a2c82 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/slsr-29.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dom3" } */ +/* { dg-options "-O2 -fdump-tree-dom4" } */ struct x { @@ -22,7 +22,7 @@ f (struct x *p, unsigned int n) } } -/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom3" { target { int32 } } } } */ -/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom3" { target { int16 } } } } */ -/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+" 1 "dom3" } } */ -/* { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 9 "dom3" } } */ +/* { dg-final { scan-tree-dump-times "\\* 4;" 1 "dom4" { target { int32 } } } } */ +/* { dg-final { scan-tree-dump-times "\\* 2;" 1 "dom4" { target { int16 } } } } */ +/* { dg-final { scan-tree-dump-times "p_\\d\+\\(D\\) \\+ \[^\r\n\]*_\\d\+" 1 "dom4" } } */ +/* { dg-final { scan-tree-dump-times "MEM *? *\\\[\\(struct x \\*\\)\[^\r\n\]*_\\d\+" 9 "dom4" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c index b64e71dae22..a9cf2578e3c 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dom-thread-7.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-dom2-stats -fdump-tree-thread2-stats -fdump-tree-dom3-stats -fno-guess-branch-probability" } */ +/* { dg-options "-O2 -fdump-tree-dom2-stats -fdump-tree-thread2-stats -fdump-tree-dom4-stats -fno-guess-branch-probability" } */ /* { dg-final { scan-tree-dump-not "Jumps threaded" "dom2" } } */ @@ -10,7 +10,7 @@ /* aarch64 has the highest CASE_VALUES_THRESHOLD in GCC. 
It's high enough to change decisions in switch expansion which in turn can expose new jump threading opportunities. Skip the later tests on aarch64. */ -/* { dg-final { scan-tree-dump-not "Jumps threaded" "dom3" { target { ! aarch64*-*-* } } } } */ +/* { dg-final { scan-tree-dump-not "Jumps threaded" "dom4" { target { ! aarch64*-*-* } } } } */ /* { dg-final { scan-tree-dump "Jumps threaded: 7" "thread2" { target { ! aarch64*-*-* } } } } */ /* { dg-final { scan-tree-dump "Jumps threaded: 18" "thread2" { target { aarch64*-*-* } } } } */ diff --git a/gcc/timevar.def b/gcc/timevar.def index 794b8017d18..26a5b49d02e 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -309,6 +309,7 @@ DEFTIMEVAR (TV_INITIALIZE_RTL , "initialize rtl") DEFTIMEVAR (TV_GIMPLE_LADDRESS , "address lowering") DEFTIMEVAR (TV_TREE_LOOP_IFCVT , "tree loop if-conversion") DEFTIMEVAR (TV_WARN_ACCESS , "access analysis") +DEFTIMEVAR (TV_SHUFFLE_FUSION , "shuffle fusion") /* Everything else in rest_of_compilation not included above. 
*/ DEFTIMEVAR (TV_EARLY_LOCAL , "early local passes") diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 55ee2fe7f9e..cf86f15c501 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -499,6 +499,7 @@ extern gimple_opt_pass *make_pass_modref (gcc::context *ctxt); extern gimple_opt_pass *make_pass_coroutine_lower_builtins (gcc::context *ctxt); extern gimple_opt_pass *make_pass_coroutine_early_expand_ifns (gcc::context *ctxt); extern gimple_opt_pass *make_pass_adjust_alignment (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_shuffle_fusion (gcc::context *ctxt); /* IPA Passes */ extern simple_ipa_opt_pass *make_pass_ipa_lower_emutls (gcc::context *ctxt); diff --git a/gcc/tree-ssa-loop.cc b/gcc/tree-ssa-loop.cc index 73aa46627b4..844df411de4 100644 --- a/gcc/tree-ssa-loop.cc +++ b/gcc/tree-ssa-loop.cc @@ -339,6 +339,7 @@ public: pass_tree_loop_init (gcc::context *ctxt) : gimple_opt_pass (pass_data_tree_loop_init, ctxt) {} + opt_pass * clone () { return new pass_tree_loop_init (m_ctxt); } /* opt_pass methods: */ virtual unsigned int execute (function *); @@ -501,6 +502,7 @@ public: pass_tree_loop_done (gcc::context *ctxt) : gimple_opt_pass (pass_data_tree_loop_done, ctxt) {} + opt_pass * clone () { return new pass_tree_loop_done (m_ctxt); } /* opt_pass methods: */ virtual unsigned int execute (function *) { return tree_ssa_loop_done (); } diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc index 232e903b0d2..f89d3c3efcc 100644 --- a/gcc/tree-ssa-math-opts.cc +++ b/gcc/tree-ssa-math-opts.cc @@ -4891,13 +4891,24 @@ public: {} /* opt_pass methods: */ + opt_pass * clone () { return new pass_optimize_widening_mul (m_ctxt); } + void set_pass_param (unsigned int n, bool param) + { + gcc_assert (n == 0); + ahead = param; + } + virtual bool gate (function *) { - return flag_expensive_optimizations && optimize; + return flag_expensive_optimizations && optimize + && (ahead ^ flag_widening_mul_ahead); } virtual unsigned int execute (function *); + private: 
+ /* Determines whether the pass moved ahead. */ + bool ahead; }; // class pass_optimize_widening_mul /* Walker class to perform the transformation in reverse dominance order. */ @@ -5073,3 +5084,4 @@ make_pass_optimize_widening_mul (gcc::context *ctxt) { return new pass_optimize_widening_mul (ctxt); } + diff --git a/gcc/tree-ssa-ter.cc b/gcc/tree-ssa-ter.cc index 4cdad0d2749..d34e435c63a 100644 --- a/gcc/tree-ssa-ter.cc +++ b/gcc/tree-ssa-ter.cc @@ -685,7 +685,7 @@ find_replaceable_in_bb (temp_expr_table *tab, basic_block bb) /* Increment counter if this is a non BUILT_IN call. We allow replacement over BUILT_IN calls since many will expand to inline insns instead of a true call. */ - if (is_gimple_call (stmt) + if (is_nofma_gimple_call (stmt) && !((fndecl = gimple_call_fndecl (stmt)) && fndecl_built_in_p (fndecl))) cur_call_cnt++; diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index 4e615b80b3a..e4c8ffd5ff3 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -425,6 +425,8 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, if (apply_safelen ()) return opt_result::success (); + vect_depandence_issue = true; + if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt, "versioning for alias required: " @@ -4210,7 +4212,10 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p, vectype, memory_type, offtype, scale, &ifn, &offset_vectype)) - ifn = IFN_LAST; + { + ifn = IFN_LAST; + vect_depandence_issue = true; + } decl = NULL_TREE; } else @@ -4225,6 +4230,7 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, if (targetm.vectorize.builtin_scatter) decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); } + if (!decl) vect_depandence_issue = true; ifn = IFN_LAST; /* The offset vector type will be read from DECL when needed. 
*/ offset_vectype = NULL_TREE; diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 3435f9378da..da3434ae43c 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -2374,11 +2374,14 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, /* Update the vectorization factor based on the SLP decision. */ vect_update_vf_for_slp (loop_vinfo); - /* Optimize the SLP graph with the vectorization factor fixed. */ - vect_optimize_slp (loop_vinfo); + if (flag_vectorize_slp_optimize) + { + /* Optimize the SLP graph with the vectorization factor fixed. */ + vect_optimize_slp (loop_vinfo); - /* Gather the loads reachable from the SLP graph entries. */ - vect_gather_slp_loads (loop_vinfo); + /* Gather the loads reachable from the SLP graph entries. */ + vect_gather_slp_loads (loop_vinfo); + } } bool saved_can_use_partial_vectors_p @@ -3016,6 +3019,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) /* Set cached VF to -1 prior to analysis, which indicates a mode has failed. */ cached_vf_per_mode[last_mode_i] = -1; + vect_depandence_issue = false; opt_loop_vec_info loop_vinfo = vect_analyze_loop_1 (loop, shared, &loop_form_info, NULL, vector_modes, mode_i, @@ -3128,6 +3132,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) "mode %s\n", GET_MODE_NAME (vector_modes[mode_i])); bool fatal; + vect_depandence_issue = true; opt_loop_vec_info loop_vinfo = vect_analyze_loop_1 (loop, shared, &loop_form_info, first_loop_vinfo, @@ -3193,6 +3198,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared) return first_loop_vinfo; } +bool vect_depandence_issue; + /* Return true if there is an in-order reduction function for CODE, storing it in *REDUC_FN if so. 
*/ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index af477c31aa3..24343ebe597 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -924,6 +924,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, bool first_stmt_phi_p = false, phi_p = false; bool maybe_soft_fail = false; tree soft_fail_nunits_vectype = NULL_TREE; + bool arraystmt = false; /* For every stmt in NODE find its def stmt/s. */ stmt_vec_info stmt_info; @@ -932,6 +933,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, gimple *stmt = stmt_info->stmt; swap[i] = 0; matches[i] = false; + arraystmt = false; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt); @@ -1033,6 +1035,20 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, else { rhs_code = gimple_assign_rhs_code (stmt); + /* Loop-slp-cooperate vectorization. + For this case create a new stmt_info for the array, and perform as + slp vectorize. Set the size as 1, and loop vectorize will perform + the vectorized stmt as a new element. */ + if (rhs_code == MEM_REF && !vect_depandence_issue && + !zerop(stmt_info->dr_aux.dr->innermost.step)) + arraystmt = flag_loop_slp_coop; + if (arraystmt) + { + if(!stmt_info->first_element) + stmt_info->first_element = stmt_info; + if(!stmt_info->size) + stmt_info->size = 1; + } load_p = gimple_vuse (stmt); } @@ -1207,7 +1223,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, } /* Grouped store or load. 
*/ - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + if (STMT_VINFO_GROUPED_ACCESS (stmt_info) || arraystmt) { if (REFERENCE_CLASS_P (lhs)) { diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 34920041116..fe28725e5b1 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -6448,6 +6448,108 @@ vectorizable_operation (vec_info *vinfo, vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies, op0, &vec_oprnds0, op1, &vec_oprnds1, op2, &vec_oprnds2); + +/* Accumulation Combine + + For loop vectorize, reuse the register after rotation. + + Before rotate: + PHI: vx1 = {v0, vy1} + vy1 = vx1 + vz[i] + + After rotate: + PHI: vx1 = {v0, vy1} + PHI: vx2 = {0, vy2} + vy1 = vx1 + vz[i] + vy2 = vx2 + vz[i+1] + + After accumulation combine: + PHI: vx1 = {v0, vy2} + vy1 = vx1 + vz[i] + vy2 = vy1 + vz[i+1] +*/ + + bool combined = false; + + if(slp_node && !op2 && op1 && op0 + && ((TREE_CODE(op0) == SSA_NAME + && SSA_NAME_DEF_STMT(op0)->code == GIMPLE_PHI) + || (TREE_CODE(op1) == SSA_NAME + && SSA_NAME_DEF_STMT(op1)->code == GIMPLE_PHI))) + { + gimple* phi; + if (TREE_CODE(op0) == SSA_NAME + && SSA_NAME_DEF_STMT(op0)->code == GIMPLE_PHI) + phi = SSA_NAME_DEF_STMT(op0); + else + phi = SSA_NAME_DEF_STMT(op1); + for (i = 0; i < gimple_phi_num_args (phi); i++) + { + tree arg = gimple_phi_arg_def (phi, i); + if (arg == scalar_dest) + combined = flag_accumulation_combine; + } + if(SLP_TREE_LANES(slp_node) % + vect_nunits_for_cost(SLP_TREE_VECTYPE(slp_node)) == 0) + combined = false; + if (vec_num % SLP_TREE_LANES(slp_node) != 0) + combined = false; + } + if (combined) + { + tree addvec[vec_num]; + tree zero_val = build_zero_cst (TREE_TYPE (vectype_out)); + FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) + { + gimple *new_stmt = NULL; + vop1 = vec_oprnds1[i]; + if (icode == GIMPLE_PHI) + new_stmt + = gimple_build_assign (vec_dest, code, + addvec[i-SLP_TREE_LANES(slp_node)], + vop1, NULL_TREE); + else + new_stmt + = gimple_build_assign (vec_dest, code, + 
addvec[i-SLP_TREE_LANES(slp_node)], + vop0, NULL_TREE); + addvec[i] = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, addvec[i]); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + if(vec_num - i <= SLP_TREE_LANES(slp_node)) + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); + } + + FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) + if (i >= SLP_TREE_LANES(slp_node)) + { + gimple *new_stmt = NULL; + if (SSA_NAME_DEF_STMT(op0)->code == GIMPLE_PHI) + new_stmt + = gimple_build_assign (vec_dest, PLUS_EXPR, + vop0, zero_val, NULL_TREE); + else + new_stmt + = gimple_build_assign (vec_dest, PLUS_EXPR, + vec_oprnds1[i], zero_val, NULL_TREE); + new_temp = make_ssa_name (vec_dest, new_stmt); + gimple_assign_set_lhs (new_stmt, new_temp); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); + } + + vec_oprnds0.release (); + vec_oprnds1.release (); + vec_oprnds2.release (); + + return true; + } + /* Arguments are ready. Create the new vector stmt. */ FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0) { diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 642eb0aeb21..dd99f98071e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2558,4 +2558,7 @@ vect_is_integer_truncation (stmt_vec_info stmt_info) && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); } +/* Flag to ensure the cases of loop-slp-cooperate vectorization. */ +extern bool vect_depandence_issue; + #endif /* GCC_TREE_VECTORIZER_H */ -- Gitee