wangding16/src-gcc

forked from src-openEuler/gcc 
0045-Transposed-SLP-Enable-Transposed-SLP.patch 93.72 KB
From 639b5248cbab1806618545fc30215ed9d1a019e7 Mon Sep 17 00:00:00 2001
From: luohailing <luo_hailing@qq.com>
Date: Fri, 17 Jun 2022 22:38:55 +0800
Subject: [PATCH 11/12] [Transposed SLP] Enable Transposed SLP

Enable Transposed SLP when memory accesses are not contiguous, controlled by
the new -ftree-slp-transpose-vectorize option.
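For illustration only (not part of the patch itself): a minimal kernel of the
shape this option targets, mirroring the new transpose-2.c test added below.
The compile command is an assumption based on the dg options this patch adds
to vect.exp; with it, the slp1 dump should report "vectorized using transposed
version".

/* Assumed compile command for a GCC build carrying this patch:
     gcc -O3 -ftree-slp-transpose-vectorize -fdump-tree-slp-details -c kernel.c  */
void
kernel (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2,
        unsigned short c0[4], unsigned short c1[4],
        unsigned short c2[4], unsigned short c3[4])
{
  /* The stores to c0..c3 are grouped, but pix1/pix2 advance by a runtime
     stride on every iteration, so the loads are not contiguous.  */
  for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
    {
      c0[i] = pix1[0] - pix2[0];
      c1[i] = pix1[1] - pix2[1];
      c2[i] = pix1[2] - pix2[2];
      c3[i] = pix1[3] - pix2[3];
    }
}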
---
gcc/common.opt | 4 +
gcc/testsuite/gcc.dg/vect/transpose-1.c | 53 ++
gcc/testsuite/gcc.dg/vect/transpose-2.c | 50 ++
gcc/testsuite/gcc.dg/vect/transpose-3.c | 54 ++
gcc/testsuite/gcc.dg/vect/transpose-4.c | 53 ++
gcc/testsuite/gcc.dg/vect/transpose-5.c | 73 ++
gcc/testsuite/gcc.dg/vect/transpose-6.c | 67 ++
gcc/testsuite/gcc.dg/vect/transpose-7.c | 53 ++
gcc/testsuite/gcc.dg/vect/transpose-8.c | 53 ++
gcc/testsuite/gcc.dg/vect/vect.exp | 7 +
gcc/tree-vect-data-refs.c | 236 +++++
gcc/tree-vect-slp.c | 1090 ++++++++++++++++++++++-
gcc/tree-vect-stmts.c | 763 +++++++++++++++-
gcc/tree-vectorizer.h | 89 ++
14 files changed, 2641 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-1.c
create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-2.c
create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-3.c
create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-4.c
create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-5.c
create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-6.c
create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-7.c
create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-8.c
diff --git a/gcc/common.opt b/gcc/common.opt
index 24834cf60..d38401b71 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3049,6 +3049,10 @@ ftree-vect-analyze-slp-group
Common Report Var(flag_tree_slp_group) Init(0)
Disable SLP vectorization for reduction chain on tree.
+ftree-slp-transpose-vectorize
+Common Report Var(flag_tree_slp_transpose_vectorize) Optimization Init(0)
+Enable basic block vectorization (SLP) for transposed stores and loads on trees.
+
fvect-cost-model=
Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization
-fvect-cost-model=[unlimited|dynamic|cheap] Specifies the cost model for vectorization.
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-1.c b/gcc/testsuite/gcc.dg/vect/transpose-1.c
new file mode 100644
index 000000000..8237a8b9e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+
+int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+ int i = 0;
+ int sum = 0;
+ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
+ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ c0[i] = pix1[0] - pix2[0];
+ c1[i] = pix1[1] - pix2[1];
+ c2[i] = pix1[2] - pix2[2];
+ c3[i] = pix1[3] - pix2[3];
+ c4[i] = pix1[4] - pix2[4];
+ c5[i] = pix1[5] - pix2[5];
+ c6[i] = pix1[6] - pix2[6];
+ c7[i] = pix1[7] - pix2[7];
+ }
+ for (int i = 0; i < N; i++)
+ {
+ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
+ }
+ return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned char input1[M];
+ unsigned char input2[M];
+ int i1 = 16;
+ int i2 = 8;
+ check_vect ();
+ for (int i = 0; i < M; i++)
+ {
+ input1[i] = i * 2;
+ input2[i] = i;
+ }
+ int sum = foo (input1, i1, input2, i2);
+ if (sum != 1264)
+ {
+ abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-2.c b/gcc/testsuite/gcc.dg/vect/transpose-2.c
new file mode 100644
index 000000000..b01a0410e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 8
+#define M 256
+
+int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+ int i = 0;
+ int sum = 0;
+ unsigned short c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
+ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ c0[i] = pix1[0] - pix2[0];
+ c1[i] = pix1[1] - pix2[1];
+ c2[i] = pix1[2] - pix2[2];
+ c3[i] = pix1[3] - pix2[3];
+ }
+ for (int i = 0; i < N; i++)
+ {
+ sum += c0[i] + c1[i] + c2[i] + c3[i];
+ }
+ return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned char input1[M];
+ unsigned char input2[M];
+ int i1 = 5;
+ int i2 = 4;
+ check_vect ();
+ for (int i = 0; i < M; i++)
+ {
+ input1[i] = i * 4;
+ input2[i] = i * 2;
+ }
+ int sum = foo (input1, i1, input2, i2);
+ if (sum != 1440)
+ {
+ abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-3.c b/gcc/testsuite/gcc.dg/vect/transpose-3.c
new file mode 100644
index 000000000..529581c59
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-3.c
@@ -0,0 +1,54 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+
+int foo (unsigned short *pix1, int i_pix1, unsigned short *pix2, int i_pix2)
+{
+ int i = 0;
+ int sum = 0;
+ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
+ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ c0[i] = pix1[0] - pix2[0];
+ c1[i] = pix1[1] - pix2[1];
+ c2[i] = pix1[2] - pix2[2];
+ c3[i] = pix1[3] - pix2[3];
+ c4[i] = pix1[4] - pix2[4];
+ c5[i] = pix1[5] - pix2[5];
+ c6[i] = pix1[6] - pix2[6];
+ c7[i] = pix1[7] - pix2[7];
+ }
+ for (int i = 0; i < N; i++)
+ {
+ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
+ }
+ return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned short input1[M];
+ unsigned short input2[M];
+ int i1 = 8;
+ int i2 = 4;
+ check_vect ();
+ for (int i = 0; i < M; i++)
+ {
+ input1[i] = i * 4;
+ input2[i] = i;
+ }
+ int sum = foo (input1, i1, input2, i2);
+ if (sum != 1680)
+ {
+ abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-4.c b/gcc/testsuite/gcc.dg/vect/transpose-4.c
new file mode 100644
index 000000000..0b4adea9b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-4.c
@@ -0,0 +1,53 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+
+int foo (unsigned *pix1, int i_pix1, unsigned *pix2, int i_pix2)
+{
+ int i = 0;
+ int sum = 0;
+ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
+ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ c0[i] = pix1[0] - pix2[0];
+ c1[i] = pix1[1] - pix2[1];
+ c2[i] = pix1[2] - pix2[2];
+ c3[i] = pix1[3] - pix2[3];
+ c4[i] = pix1[4] - pix2[4];
+ c5[i] = pix1[5] - pix2[5];
+ c6[i] = pix1[6] - pix2[6];
+ c7[i] = pix1[7] - pix2[7];
+ }
+ for (int i = 0; i < N; i++)
+ {
+ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
+ }
+ return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned input1[M];
+ unsigned input2[M];
+ int i1 = 12;
+ int i2 = 6;
+ check_vect ();
+ for (int i = 0; i < M; i++)
+ {
+ input1[i] = i * 7;
+ input2[i] = i * 3;
+ }
+ int sum = foo (input1, i1, input2, i2);
+ if (sum != 3616)
+ {
+ abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-5.c b/gcc/testsuite/gcc.dg/vect/transpose-5.c
new file mode 100644
index 000000000..81a248840
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-5.c
@@ -0,0 +1,73 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+#define eps 1e-8
+
+double foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+ unsigned a0[N];
+ unsigned a1[N];
+ unsigned a2[N];
+ unsigned a3[N];
+
+ int b0[N];
+ int b1[N];
+ int b2[N];
+ int b3[N];
+
+ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] + pix2[4]) << 16);
+ a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] + pix2[5]) << 16);
+ a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] + pix2[6]) << 16);
+ a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] + pix2[7]) << 16);
+ }
+
+ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ b0[i] = (pix1[0] - pix2[0]) + (pix1[4] + pix2[4]);
+ b1[i] = (pix1[1] - pix2[1]) + (pix1[5] + pix2[5]);
+ b2[i] = (pix1[2] - pix2[2]) + (pix1[6] + pix2[6]);
+ b3[i] = (pix1[3] - pix2[3]) + (pix1[7] + pix2[7]);
+ }
+
+ double sum = 0;
+ for (int i = 0; i < N; i++)
+ {
+ sum += a0[i] + a1[i] + a2[i] + a3[i] + b0[i] + b1[i] + b2[i] + b3[i];
+ }
+ return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned char input1[M];
+ unsigned char input2[M];
+ int i1 = 8;
+ int i2 = 3;
+ unsigned char m = 2;
+ unsigned short n = 12;
+ float t = 3.0;
+ double k = 4.2;
+ check_vect ();
+ for (int i = 0; i < M; i++)
+ {
+ input1[i] = i * 6;
+ input2[i] = i * 3;
+ }
+ double sum = foo (input1, i1, input2, i2);
+ if (fabs (sum - 78648144) > eps)
+ {
+ abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-6.c b/gcc/testsuite/gcc.dg/vect/transpose-6.c
new file mode 100644
index 000000000..3e134ac02
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-6.c
@@ -0,0 +1,67 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_float } */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "tree-vect.h"
+
+#define N 4
+#define M 256
+#define eps 1e-8
+
+float foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+ unsigned a0[N];
+ unsigned a1[N];
+ unsigned a2[N];
+ unsigned a3[N];
+
+ float c0[N];
+ float c1[N];
+ float c2[N];
+ float c3[N];
+
+ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+ a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+ a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+ a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+
+ c0[i] = (pix1[0] * pix2[0]) + (pix1[4] * pix2[4]);
+ c1[i] = (pix1[1] * pix2[1]) + (pix1[5] * pix2[5]);
+ c2[i] = (pix1[2] * pix2[2]) + (pix1[6] * pix2[6]);
+ c3[i] = (pix1[3] * pix2[3]) + (pix1[7] * pix2[7]);
+ }
+
+ float sum = 0;
+ for (int i = 0; i < N; i++)
+ {
+ sum += a0[i] + a1[i] + a2[i] + a3[i] + c0[i] + c1[i] + c2[i] + c3[i];
+ }
+ return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned char input1[M];
+ unsigned char input2[M];
+ int i1 = 18;
+ int i2 = 6;
+ check_vect ();
+ for (int i = 0; i < M; i++)
+ {
+ input1[i] = i * 4;
+ input2[i] = i * 2;
+ }
+ float sum = foo (input1, i1, input2, i2);
+ if (fabs (sum - 106041168) > eps)
+ {
+ abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-7.c b/gcc/testsuite/gcc.dg/vect/transpose-7.c
new file mode 100644
index 000000000..2074d9aa8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-7.c
@@ -0,0 +1,53 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 16
+#define M 256
+
+int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+ int i = 0;
+ int sum = 0;
+ unsigned char c0[N], c1[N];
+ for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ c0[i] = pix1[0] - pix2[0];
+ c1[i] = pix1[1] - pix2[1];
+ }
+ for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ c0[i] = pix1[0] - pix2[0];
+ c1[i] = pix1[1] - pix2[1];
+ }
+ for (int i = 0; i < N; i++)
+ {
+ sum += c0[i] + c1[i];
+ }
+ return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned char input1[M];
+ unsigned char input2[M];
+ int i1 = 6;
+ int i2 = 4;
+ check_vect ();
+ for (int i = 0; i < M; i++)
+ {
+ input1[i] = i * 5;
+ input2[i] = i * 2;
+ }
+ int sum = foo (input1, i1, input2, i2);
+ if (sum != 3280)
+ {
+ abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/transpose-8.c b/gcc/testsuite/gcc.dg/vect/transpose-8.c
new file mode 100644
index 000000000..a154f012a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/transpose-8.c
@@ -0,0 +1,53 @@
+/* { dg-do compile { target { aarch64*-*-linux* } } } */
+/* { dg-additional-options "-fno-tree-loop-vectorize" } */
+/* { dg-require-effective-target vect_int } */
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree-vect.h"
+
+#define N 32
+#define M 256
+
+int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
+{
+ int i = 0;
+ int sum = 0;
+ unsigned char c0[N], c1[N];
+ for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ c0[i] = pix1[0] - pix2[0];
+ c1[i] = pix1[1] - pix2[1];
+ }
+ for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
+ {
+ c0[i] = pix1[0] - pix2[0];
+ c1[i] = pix1[1] - pix2[1];
+ }
+ for (int i = 0; i < N; i++)
+ {
+ sum += c0[i] + c1[i];
+ }
+ return sum;
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned char input1[M];
+ unsigned char input2[M];
+ int i1 = 6;
+ int i2 = 4;
+ check_vect ();
+ for (int i = 0; i < M; i++)
+ {
+ input1[i] = i * 5;
+ input2[i] = i * 2;
+ }
+ int sum = foo (input1, i1, input2, i2);
+ if (sum != 7584)
+ {
+ abort ();
+ }
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp
index efe17ac6f..d92e1ba5b 100644
--- a/gcc/testsuite/gcc.dg/vect/vect.exp
+++ b/gcc/testsuite/gcc.dg/vect/vect.exp
@@ -114,6 +114,13 @@ et-dg-runtest dg-runtest [lsort \
[glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
+# -ftree-slp-transpose-vectorize SLP tests
+set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
+lappend VECT_SLP_CFLAGS "-ftree-slp-transpose-vectorize"
+et-dg-runtest dg-runtest [lsort \
+ [glob -nocomplain $srcdir/$subdir/transpose-*.\[cS\]]] \
+ "" "-ftree-slp-transpose-vectorize -fdump-tree-slp-details -O3"
+
# -ffast-math tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-ffast-math"
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index fcc0726bd..d78b06455 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -2647,6 +2647,9 @@ vect_analyze_group_access_1 (dr_vec_info *dr_info)
DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
DR_GROUP_SIZE (stmt_info) = groupsize;
+
+ DR_GROUP_SLP_TRANSPOSE (stmt_info) = false;
+
if (dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location,
@@ -2676,6 +2679,20 @@ vect_analyze_group_access_1 (dr_vec_info *dr_info)
DR_GROUP_GAP (stmt_info));
}
+ /* SLP: create an SLP data structure for every interleaving group of
+ loads for further analysis in vect_analyse_slp. */
+ if (DR_IS_READ (dr) && !slp_impossible)
+ {
+ if (loop_vinfo)
+ {
+ LOOP_VINFO_GROUPED_LOADS (loop_vinfo).safe_push (stmt_info);
+ }
+ if (bb_vinfo)
+ {
+ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (stmt_info);
+ }
+ }
+
/* SLP: create an SLP data structure for every interleaving group of
stores for further analysis in vect_analyse_slp. */
if (DR_IS_WRITE (dr) && !slp_impossible)
@@ -5413,6 +5430,225 @@ vect_permute_store_chain (vec<tree> dr_chain,
}
}
+/* Encode the first-stage permute masks PERM_MASK_HIGH_FIRST and PERM_MASK_LOW_FIRST. */
+
+static void
+vect_indices_encoding_first (tree vectype, unsigned int array_num,
+ tree &perm_mask_high_first,
+ tree &perm_mask_low_first)
+{
+ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+ vec_perm_builder sel (nelt, nelt, 1);
+ sel.quick_grow (nelt);
+ unsigned int group_num = nelt / array_num;
+ unsigned int index = 0;
+ unsigned int array = 0;
+ unsigned int group = 0;
+
+ /* The encoding has 1 pattern in the first stage. */
+ for (array = 0; array < array_num / 2; array++)
+ {
+ for (group = 0; group < group_num * 2; group++)
+ {
+ sel[index++] = array + array_num * group;
+ }
+ }
+ vec_perm_indices indices (sel, 2, nelt);
+ perm_mask_high_first = vect_gen_perm_mask_checked (vectype, indices);
+
+ index = 0;
+ for (array = array_num / 2; array < array_num; array++)
+ {
+ for (group = 0; group < group_num * 2; group++)
+ {
+ sel[index++] = array + array_num * group;
+ }
+ }
+ indices.new_vector (sel, 2, nelt);
+ perm_mask_low_first = vect_gen_perm_mask_checked (vectype, indices);
+}
+
+/* Encode the following-stage permute masks PERM_MASK_HIGH and PERM_MASK_LOW. */
+
+static void
+vect_indices_encoding (tree vectype, unsigned int array_num,
+ tree &perm_mask_high, tree &perm_mask_low)
+{
+ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+ vec_perm_builder sel (nelt, nelt, 1);
+ sel.quick_grow (nelt);
+ unsigned int group_num = nelt / array_num;
+ unsigned int index = 0;
+ unsigned int array = 0;
+ unsigned int group = 0;
+
+ /* The encoding has 2 patterns in the following stages. */
+ for (array = 0; array < array_num / 2; array++)
+ {
+ for (group = 0; group < group_num; group++)
+ {
+ sel[index++] = group + group_num * array;
+ }
+ for (group = 0; group < group_num; group++)
+ {
+ sel[index++] = nelt + group + group_num * array;
+ }
+ }
+ vec_perm_indices indices (sel, 2, nelt);
+ perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
+
+ index = 0;
+ for (array = array_num / 2; array < array_num; array++)
+ {
+ for (group = 0; group < group_num; group++)
+ {
+ sel[index++] = group + group_num * array;
+ }
+ for (group = 0; group < group_num; group++)
+ {
+ sel[index++] = nelt + group + group_num * array;
+ }
+ }
+ indices.new_vector (sel, 2, nelt);
+ perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
+}
+
+/* Function vect_transpose_store_chain.
+
+ Given a chain of interleaved stores in DR_CHAIN of LENGTH (which must be
+ a power of 2) and ARRAY_NUM. Generate interleave_high/low stmts to reorder
+ the data correctly for the stores. Return the final references for stores
+ in RESULT_CHAIN. This function is similar to vect_permute_store_chain ():
+ we interleave the contents of the vectors in their order.
+
+ E.g., LENGTH is 4, the scalar type is short (i.e., VF is 8) and ARRAY_NUM
+ is 4. That is, the input is 4 vectors each containing 8 elements, and
+ every 2 (VF / ARRAY_NUM) of the 8 elements come from the same array. We
+ interleave the contents of the four vectors in their order. We assign a
+ number to each element; the input sequence is:
+
+ 1st vec: 0 1 2 3 4 5 6 7
+ 2nd vec: 8 9 10 11 12 13 14 15
+ 3rd vec: 16 17 18 19 20 21 22 23
+ 4th vec: 24 25 26 27 28 29 30 31
+
+ The output sequence should be:
+
+ 1st vec: 0 4 8 12 16 20 24 28
+ 2nd vec: 1 5 9 13 17 21 25 29
+ 3rd vec: 2 6 10 14 18 22 26 30
+ 4th vec: 3 7 11 15 19 23 27 31
+
+ In our example,
+ we get 2 (VF / ARRAY_NUM) elements together in every vector.
+
+ I1: 0 4 1 5 2 6 3 7
+ I2: 8 12 9 13 10 14 11 15
+ I3: 16 20 17 21 18 22 19 23
+ I4: 24 28 25 29 26 30 27 31
+
+ Then, we use interleave_high/low instructions to create such output.
+ Every 2 (VF / ARRAY_NUM) elements are regarded as a whole. The permutation
+ is done in log LENGTH stages.
+
+ I1: interleave_high (1st vec, 3rd vec)
+ I2: interleave_low (1st vec, 3rd vec)
+ I3: interleave_high (2nd vec, 4th vec)
+ I4: interleave_low (2nd vec, 4th vec)
+
+ The first stage of the sequence should be:
+
+ I1: 0 4 16 20 1 5 17 21
+ I2: 2 6 18 22 3 7 19 23
+ I3: 8 12 24 28 9 13 25 29
+ I4: 10 14 26 30 11 15 27 31
+
+ The following stage's sequence, i.e. the final result, should be:
+
+ I1: 0 4 8 12 16 20 24 28
+ I2: 1 5 9 13 17 21 25 29
+ I3: 2 6 10 14 18 22 26 30
+ I4: 3 7 11 15 19 23 27 31. */
+
+void
+vect_transpose_store_chain (vec<tree> dr_chain, unsigned int length,
+ unsigned int array_num, stmt_vec_info stmt_info,
+ gimple_stmt_iterator *gsi, vec<tree> *result_chain)
+{
+ gimple *perm_stmt = NULL;
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+ tree perm_mask_low_first = NULL;
+ tree perm_mask_high_first = NULL;
+ tree perm_mask_low = NULL;
+ tree perm_mask_high = NULL;
+ unsigned int log_length = exact_log2 (length);
+
+ /* Only power of 2 is supported. */
+ gcc_assert (pow2p_hwi (length));
+
+ /* The encoding has 2 types, one for the grouped pattern in the first stage,
+ another for the interleaved patterns in the following stages. */
+ gcc_assert (array_num != 0);
+
+ /* Create grouped stmt (in the first stage):
+ group = nelt / array_num;
+ high_first = VEC_PERM_EXPR <vect1, vect2,
+ {0, array_num, 2*array_num, ..., (2*group-1)*array_num,
+ 1, 1+array_num, 1+2*array_num, ..., 1+(2*group-1)*array_num,
+ ...,
+ array_num/2-1, (array_num/2-1)+array_num, ...,
+ (array_num/2-1)+(2*group-1)*array_num}>
+ low_first = VEC_PERM_EXPR <vect1, vect2,
+ {array_num/2, array_num/2+array_num, array_num/2+2*array_num,
+ ..., array_num/2+(2*group-1)*array_num,
+ array_num/2+1, array_num/2+1+array_num,
+ ..., array_num/2+1+(2*group-1)*array_num,
+ ...,
+ array_num-1, array_num-1+array_num,
+ ..., array_num-1+(2*group-1)*array_num}> */
+ vect_indices_encoding_first (vectype, array_num, perm_mask_high_first,
+ perm_mask_low_first);
+
+ /* Create interleaving stmt (in the following stages):
+ high = VEC_PERM_EXPR <vect1, vect2, {0, 1, ..., group-1,
+ nelt, nelt+1, ..., nelt+group-1,
+ group, group+1, ..., 2*group-1,
+ nelt+group, nelt+group+1, ..., nelt+2*group-1,
+ ...}>
+ low = VEC_PERM_EXPR <vect1, vect2,
+ {nelt/2, nelt/2+1, ..., nelt/2+group-1,
+ nelt*3/2, nelt*3/2+1, ..., nelt*3/2+group-1,
+ nelt/2+group, nelt/2+group+1, ..., nelt/2+2*group-1,
+ nelt*3/2+group, nelt*3/2+group+1, ..., nelt*3/2+2*group-1,
+ ...}> */
+ vect_indices_encoding (vectype, array_num, perm_mask_high, perm_mask_low);
+
+ for (unsigned int perm_time = 0; perm_time < log_length; perm_time++)
+ {
+ for (unsigned int index = 0; index < length / 2; index++)
+ {
+ tree vect1 = dr_chain[index];
+ tree vect2 = dr_chain[index + length / 2];
+
+ tree high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
+ perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, vect2,
+ perm_time == 0 ? perm_mask_high_first
+ : perm_mask_high);
+ vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
+ (*result_chain)[2 * index] = high;
+
+ tree low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
+ perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, vect2,
+ perm_time == 0 ? perm_mask_low_first
+ : perm_mask_low);
+ vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
+ (*result_chain)[2 * index+1] = low;
+ }
+ memcpy (dr_chain.address (), result_chain->address (),
+ length * sizeof (tree));
+ }
+}
+
/* Function vect_setup_realignment
This function is called when vectorizing an unaligned load using
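For concreteness, a standalone host-side sketch (illustrative only, not part of
the patch) that reproduces the permute selectors built by
vect_indices_encoding_first and vect_indices_encoding for the worked example in
the vect_transpose_store_chain comment above (nelt = 8, array_num = 4, hence
group_num = 2):

#include <stdio.h>

int
main (void)
{
  enum { NELT = 8, ARRAY_NUM = 4, GROUP_NUM = NELT / ARRAY_NUM };
  unsigned sel[NELT];
  unsigned index = 0;

  /* First-stage "high" selector: prints 0 4 8 12 1 5 9 13, which picks
     0 4 16 20 1 5 17 21 out of (1st vec, 3rd vec) in the example above.  */
  for (unsigned array = 0; array < ARRAY_NUM / 2; array++)
    for (unsigned group = 0; group < GROUP_NUM * 2; group++)
      sel[index++] = array + ARRAY_NUM * group;
  for (unsigned i = 0; i < NELT; i++)
    printf ("%u ", sel[i]);
  printf ("\n");

  /* Following-stage "high" selector: prints 0 1 8 9 2 3 10 11, which
     interleaves GROUP_NUM-element blocks of the two intermediate vectors.  */
  index = 0;
  for (unsigned array = 0; array < ARRAY_NUM / 2; array++)
    {
      for (unsigned group = 0; group < GROUP_NUM; group++)
        sel[index++] = group + GROUP_NUM * array;
      for (unsigned group = 0; group < GROUP_NUM; group++)
        sel[index++] = NELT + group + GROUP_NUM * array;
    }
  for (unsigned i = 0; i < NELT; i++)
    printf ("%u ", sel[i]);
  printf ("\n");
  return 0;
}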
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 476b32370..d30463b96 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2414,11 +2414,13 @@ vect_analyze_slp_instance (vec_info *vinfo,
/* For basic block SLP, try to break the group up into multiples of the
vector size. */
+ bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
unsigned HOST_WIDE_INT const_nunits;
if (is_a <bb_vec_info> (vinfo)
&& STMT_VINFO_GROUPED_ACCESS (stmt_info)
&& DR_GROUP_FIRST_ELEMENT (stmt_info)
- && nunits.is_constant (&const_nunits))
+ && nunits.is_constant (&const_nunits)
+ && !bb_vinfo->transposed)
{
/* We consider breaking the group only on VF boundaries from the existing
start. */
@@ -2455,6 +2457,898 @@ vect_analyze_slp_instance (vec_info *vinfo,
return false;
}
+static inline bool
+is_const_assign (stmt_vec_info store_elem)
+{
+ if (store_elem == NULL)
+ {
+ gcc_unreachable ();
+ }
+ gimple *stmt = store_elem->stmt;
+ gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt);
+ return rhs_class == GIMPLE_SINGLE_RHS
+ && TREE_CONSTANT (gimple_assign_rhs1 (store_elem->stmt));
+}
+
+/* Push inits to INNERMOST_INITS and offsets to INNERMOST_OFFSETS; check const assigns. */
+
+static bool
+record_innermost (vec<tree> &innermost_inits,
+ vec<tree> &innermost_offsets,
+ stmt_vec_info stmt_vinfo)
+{
+ if (!stmt_vinfo)
+ {
+ return false;
+ }
+ stmt_vec_info next_info = stmt_vinfo;
+ while (next_info)
+ {
+ /* No need to vectorize constant assign in a transposed version. */
+ if (is_const_assign (next_info))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "no need to vectorize, store is const assign: %G",
+ next_info->stmt);
+ }
+ return false;
+ }
+ innermost_inits.safe_push (STMT_VINFO_DR_INIT (next_info));
+ innermost_offsets.safe_push (STMT_VINFO_DR_OFFSET (next_info));
+ next_info = DR_GROUP_NEXT_ELEMENT (next_info);
+ }
+ return true;
+}
+
+/* Compare inits to INNERMOST_INITS; return FALSE if the inits do not match
+ those of the first grouped store. Also check for const assigns. */
+
+static bool
+compare_innermost (const vec<tree> &innermost_inits,
+ const vec<tree> &innermost_offsets,
+ stmt_vec_info stmt_vinfo)
+{
+ if (!stmt_vinfo || innermost_inits.length () != stmt_vinfo->size)
+ {
+ return false;
+ }
+ stmt_vec_info next_info = stmt_vinfo;
+ unsigned int i = 0;
+ while (next_info)
+ {
+ if (is_const_assign (next_info))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "no need to vectorize, store is const "
+ "assign: %G", next_info->stmt);
+ }
+ return false;
+ }
+ if (innermost_inits[i] != STMT_VINFO_DR_INIT (next_info)
+ || innermost_offsets[i] != STMT_VINFO_DR_OFFSET (next_info))
+ {
+ return false;
+ }
+ next_info = DR_GROUP_NEXT_ELEMENT (next_info);
+ i++;
+ }
+ return true;
+}
+
+/* Check if grouped stores are of the same type.
+ Input: t1/t2 = TREE_TYPE (gimple_assign_lhs (first_element->stmt)).
+ Output: 0 if same, otherwise 1 or -1. */
+
+static int
+tree_type_cmp (const tree t1, const tree t2)
+{
+ gcc_checking_assert (t1 != NULL && t2 != NULL);
+ if (t1 != t2)
+ {
+ if (TREE_CODE (t1) != TREE_CODE (t2))
+ {
+ return TREE_CODE (t1) > TREE_CODE (t2) ? 1 : -1;
+ }
+ if (TYPE_UNSIGNED (t1) != TYPE_UNSIGNED (t2))
+ {
+ return TYPE_UNSIGNED (t1) > TYPE_UNSIGNED (t2) ? 1 : -1;
+ }
+ if (TYPE_PRECISION (t1) != TYPE_PRECISION (t2))
+ {
+ return TYPE_PRECISION (t1) > TYPE_PRECISION (t2) ? 1 : -1;
+ }
+ }
+ return 0;
+}
+
+/* Check whether two grouped stores are of the same type, so that
+ we can analyze them in one transpose group. */
+static int
+check_same_store_type (stmt_vec_info grp1, stmt_vec_info grp2)
+{
+ if (grp1 == grp2)
+ {
+ return 0;
+ }
+ if (grp1->size != grp2->size)
+ {
+ return grp1->size > grp2->size ? 1 : -1;
+ }
+ tree lhs1 = gimple_assign_lhs (grp1->stmt);
+ tree lhs2 = gimple_assign_lhs (grp2->stmt);
+ if (TREE_CODE (lhs1) != TREE_CODE (lhs2))
+ {
+ return TREE_CODE (lhs1) > TREE_CODE (lhs2) ? 1 : -1;
+ }
+ tree grp_type1 = TREE_TYPE (gimple_assign_lhs (grp1->stmt));
+ tree grp_type2 = TREE_TYPE (gimple_assign_lhs (grp2->stmt));
+ int cmp = tree_type_cmp (grp_type1, grp_type2);
+ return cmp;
+}
+
+/* Comparison function to sort grouped stores by group_size and store type.
+ Output: 0 if equal, 1 if GRP1 > GRP2, -1 otherwise. */
+
+static int
+grouped_store_cmp (const void *grp1_, const void *grp2_)
+{
+ stmt_vec_info grp1 = *(stmt_vec_info *)const_cast<void *>(grp1_);
+ stmt_vec_info grp2 = *(stmt_vec_info *)const_cast<void *>(grp2_);
+ return check_same_store_type (grp1, grp2);
+}
+
+/* Transposing is based on permutations in registers. Permutation requires
+ the vector length to be a power of 2 and to fill a supported vector mode. */
+
+static inline bool
+check_filling_reg (stmt_vec_info current_element)
+{
+ if (current_element->size == 0)
+ {
+ return false;
+ }
+ /* If the gimple STMT was already vectorized by an earlier vect pass,
+ transpose analysis cannot be done on it; skip it. */
+ bool lhs_vectorized
+ = TREE_CODE (TREE_TYPE (gimple_get_lhs (current_element->stmt)))
+ == VECTOR_TYPE;
+ bool rhs_vectorized
+ = TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (current_element->stmt)))
+ == VECTOR_TYPE;
+ if (lhs_vectorized || rhs_vectorized)
+ {
+ return false;
+ }
+ unsigned int store_precision
+ = TYPE_PRECISION (TREE_TYPE (gimple_get_lhs (current_element->stmt)));
+ auto_vector_modes vector_modes;
+ targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
+ unsigned min_mode_size = -1u;
+ for (unsigned i = 0; i < vector_modes.length (); i++)
+ {
+ unsigned mode_bit_size = (GET_MODE_BITSIZE (vector_modes[i])).coeffs[0];
+ min_mode_size = mode_bit_size < min_mode_size
+ ? mode_bit_size : min_mode_size;
+ }
+ return store_precision != 0
+ && pow2p_hwi (current_element->size)
+ && (current_element->size * store_precision % min_mode_size == 0);
+}
+
+/* Check if the previous groups are suitable for transposing; if not, set
+ their group numbers to -1, reduce grp_num and clear current_groups.
+ Otherwise, just clear current_groups. */
+
+static void
+check_and_clear_groups (vec<stmt_vec_info> current_groups,
+ unsigned int &grp_num)
+{
+ stmt_vec_info first_element;
+ if (current_groups.length () == 1
+ || (current_groups.length () != 0
+ && !pow2p_hwi (current_groups.length ())))
+ {
+ while (current_groups.length () != 0)
+ {
+ first_element = current_groups.pop ();
+ first_element->group_number = -1;
+ }
+ grp_num--;
+ }
+ else
+ {
+ while (current_groups.length ())
+ {
+ current_groups.pop ();
+ }
+ }
+}
+
+
+/* Make sure that transposed SLP vectorization is conducted only if the
+ grouped stores are one-dimensional array refs. */
+
+static bool
+is_store_one_dim_array (gimple *stmt)
+{
+ tree op = gimple_get_lhs (stmt);
+ if (TREE_CODE (op) != ARRAY_REF)
+ return false;
+ return TREE_OPERAND_LENGTH (op) > 0
+ && TREE_OPERAND_LENGTH (TREE_OPERAND (op, 0)) == 0;
+}
+
+/* Put grouped_stores with similar MEM_REFs into the same group and mark their
+ grp_num. Groups with the same grp_num form the minimum unit for transpose
+ analysis. Return the number of such units. */
+
+static unsigned
+vect_prepare_transpose (bb_vec_info bb_vinfo)
+{
+ stmt_vec_info current_element = NULL;
+ stmt_vec_info first_element = NULL;
+ unsigned int i = 0;
+ unsigned int grp_num = 0;
+ /* Use arrays to record MEM_REF data in different GROUPED_STORES. */
+ auto_vec<tree> innermost_inits;
+ auto_vec<tree> innermost_offsets;
+
+ /* A set of stmt_vec_info with same store type. Analyze them if their size
+ is suitable to transpose. */
+ auto_vec<stmt_vec_info> current_groups;
+
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, current_element)
+ {
+ /* Compare the current grouped store to the first one if first_element
+ exists; push current_element to current_groups if their MEM_REFs have
+ matching innermost behavior. */
+ if (first_element != NULL
+ && !check_same_store_type (first_element, current_element)
+ && compare_innermost (innermost_inits, innermost_offsets,
+ current_element))
+ {
+ current_groups.safe_push (current_element);
+ current_element->group_number = grp_num;
+ /* If current_element is the last element in grouped_stores, the continue
+ below would exit the loop and leave the last group unanalyzed. */
+ if (i == bb_vinfo->grouped_stores.length () - 1)
+ {
+ check_and_clear_groups (current_groups, grp_num);
+ }
+ continue;
+ }
+ check_and_clear_groups (current_groups, grp_num);
+ innermost_inits.release ();
+ innermost_offsets.release ();
+ /* Start a new group and analyze whether its members are able to form
+ a unit for transpose analysis. */
+ first_element = NULL;
+ if (is_store_one_dim_array (current_element->stmt)
+ && check_filling_reg (current_element)
+ && record_innermost (innermost_inits, innermost_offsets,
+ current_element))
+ {
+ first_element = current_element;
+ current_groups.safe_push (current_element);
+ current_element->group_number = ++grp_num;
+ if (i == bb_vinfo->grouped_stores.length () - 1)
+ {
+ check_and_clear_groups (current_groups, grp_num);
+ }
+ continue;
+ }
+ current_element->group_number = -1;
+ }
+ return grp_num;
+}
+
+/* Return a flag to transpose grouped stores before building slp tree.
+ Add bool may_transpose in class vec_info. */
+
+static bool
+vect_may_transpose (bb_vec_info bb_vinfo)
+{
+ if (targetm.vectorize.vec_perm_const == NULL)
+ {
+ return false;
+ }
+ if (bb_vinfo->grouped_stores.length () < 2)
+ {
+ return false;
+ }
+ DUMP_VECT_SCOPE ("analyze if grouped stores may transpose to slp");
+ /* Sort grouped_stores according to size and type for function
+ vect_prepare_transpose (). */
+ bb_vinfo->grouped_stores.qsort (grouped_store_cmp);
+
+ int groups = vect_prepare_transpose (bb_vinfo);
+ BB_VINFO_TRANS_GROUPS (bb_vinfo) = groups;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "%d groups to analyze transposed slp.\n", groups);
+ return groups != 0;
+}
+
+/* Get the base address of STMT_INFO. */
+
+static tree
+get_op_base_address (stmt_vec_info stmt_info)
+{
+ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ tree op = DR_BASE_ADDRESS (dr);
+ while (TREE_OPERAND_LENGTH (op) > 0)
+ {
+ op = TREE_OPERAND (op, 0);
+ }
+ return op;
+}
+
+/* Compare the UIDs of the two stmt_infos STMTINFO_A and STMTINFO_B,
+ sorting them in ascending order. */
+
+static int
+dr_group_cmp (const void *stmtinfo_a_, const void *stmtinfo_b_)
+{
+ stmt_vec_info stmtinfo_a
+ = *(stmt_vec_info *) const_cast<void *> (stmtinfo_a_);
+ stmt_vec_info stmtinfo_b
+ = *(stmt_vec_info *) const_cast<void *> (stmtinfo_b_);
+
+ /* Stabilize sort. */
+ if (stmtinfo_a == stmtinfo_b)
+ {
+ return 0;
+ }
+ return gimple_uid (stmtinfo_a->stmt) < gimple_uid (stmtinfo_b->stmt) ? -1 : 1;
+}
+
+/* Find the first elements of the grouped loads which need to be merged. */
+
+static void
+vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec<bool> &visited,
+ vec<stmt_vec_info> &res)
+{
+ unsigned int i = 0;
+ stmt_vec_info merge_first_element = NULL;
+ stmt_vec_info first_element = NULL;
+ tree opa = NULL;
+ unsigned int grp_size_a = 0;
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, i, first_element)
+ {
+ if (visited[i])
+ {
+ continue;
+ }
+ if (!STMT_VINFO_GROUPED_ACCESS (first_element)
+ || !pow2p_hwi (DR_GROUP_SIZE (first_element)))
+ {
+ /* Non-conforming grouped load should be grouped separately. */
+ if (merge_first_element == NULL)
+ {
+ visited[i] = true;
+ res.safe_push (first_element);
+ return;
+ }
+ }
+ if (merge_first_element == NULL)
+ {
+ merge_first_element = first_element;
+ opa = get_op_base_address (first_element);
+ grp_size_a = DR_GROUP_SIZE (first_element);
+ res.safe_push (first_element);
+ visited[i] = true;
+ continue;
+ }
+
+ /* If the two first elements are of the same base address and group size,
+ these two grouped loads need to be merged. */
+ tree opb = get_op_base_address (first_element);
+ unsigned int grp_size_b = DR_GROUP_SIZE (first_element);
+ if (opa == opb && grp_size_a == grp_size_b)
+ {
+ res.safe_push (first_element);
+ visited[i] = true;
+ }
+ }
+}
+
+/* Merge the grouped loads that are found from
+ vect_slp_grouped_load_find (). */
+
+static stmt_vec_info
+vect_slp_grouped_load_merge (vec<stmt_vec_info> res)
+{
+ stmt_vec_info stmt_info = res[0];
+ if (res.length () == 1)
+ {
+ return stmt_info;
+ }
+ unsigned int i = 0;
+ unsigned int size = DR_GROUP_SIZE (res[0]);
+ unsigned int new_group_size = size * res.length ();
+ stmt_vec_info first_element = NULL;
+ stmt_vec_info merge_first_element = NULL;
+ stmt_vec_info last_element = NULL;
+ FOR_EACH_VEC_ELT (res, i, first_element)
+ {
+ if (merge_first_element == NULL)
+ {
+ merge_first_element = first_element;
+ last_element = merge_first_element;
+ size = DR_GROUP_SIZE (merge_first_element);
+ }
+
+ if (last_element != first_element
+ && !DR_GROUP_NEXT_ELEMENT (last_element))
+ {
+ DR_GROUP_NEXT_ELEMENT (last_element) = first_element;
+ /* Store the gap from the previous member of the group. If there is
+ no gap in the access, DR_GROUP_GAP is always 1. */
+ DR_GROUP_GAP_TRANS (first_element) = DR_GROUP_GAP (first_element);
+ DR_GROUP_GAP (first_element) = 1;
+ }
+ for (stmt_info = first_element; stmt_info;
+ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
+ {
+ DR_GROUP_FIRST_ELEMENT (stmt_info) = merge_first_element;
+ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info);
+ DR_GROUP_SIZE (stmt_info) = new_group_size;
+ last_element = stmt_info;
+ }
+ }
+ DR_GROUP_SIZE (merge_first_element) = new_group_size;
+ DR_GROUP_SLP_TRANSPOSE (merge_first_element) = true;
+ DR_GROUP_NEXT_ELEMENT (last_element) = NULL;
+ return merge_first_element;
+}
+
+/* Merge the grouped loads which have the same base address and group size.
+ For example, for grouped loads (opa_1, opa_2, opb_1, opb_2):
+ opa_1: a0->a1->a2->a3
+ opa_2: a8->a9->a10->a11
+ opb_1: b0->b1
+ opb_2: b16->b17
+ we can probably get two merged grouped loads:
+ opa: a0->a1->a2->a3->a8->a9->a10->a11
+ opb: b0->b1->b16->b17. */
+
+static bool
+vect_merge_slp_grouped_loads (bb_vec_info bb_vinfo)
+{
+ if (bb_vinfo->grouped_loads.length () <= 0)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "The number of grouped loads is 0.\n");
+ }
+ return false;
+ }
+ bb_vinfo->grouped_loads.qsort (dr_group_cmp);
+ auto_vec<bool> visited (bb_vinfo->grouped_loads.length ());
+ auto_vec<stmt_vec_info> grouped_loads_merge;
+ for (unsigned int i = 0; i < bb_vinfo->grouped_loads.length (); i++)
+ {
+ visited.safe_push (false);
+ }
+ while (1)
+ {
+ /* Find grouped loads which are required to merge. */
+ auto_vec<stmt_vec_info> res;
+ vect_slp_grouped_load_find (bb_vinfo, visited, res);
+ if (res.is_empty ())
+ {
+ break;
+ }
+ /* Merge the required grouped loads into one group. */
+ grouped_loads_merge.safe_push (vect_slp_grouped_load_merge (res));
+ }
+ if (grouped_loads_merge.length () == bb_vinfo->grouped_loads.length ())
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "No grouped loads need to be merged.\n");
+ }
+ return false;
+ }
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Merging grouped loads successfully.\n");
+ }
+ BB_VINFO_GROUPED_LOADS (bb_vinfo).release ();
+ for (unsigned int i = 0; i < grouped_loads_merge.length (); i++)
+ {
+ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (grouped_loads_merge[i]);
+ }
+ return true;
+}
+
+/* Find the first elements of the grouped stores
+ which need to be transposed and merged. */
+
+static void
+vect_slp_grouped_store_find (bb_vec_info bb_vinfo, vec<bool> &visited,
+ vec<stmt_vec_info> &res)
+{
+ stmt_vec_info first_element = NULL;
+ stmt_vec_info merge_first_element = NULL;
+ unsigned int k = 0;
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
+ {
+ if (visited[k])
+ {
+ continue;
+ }
+ /* Non-conforming grouped store should be grouped separately. */
+ if (!STMT_VINFO_GROUPED_ACCESS (first_element)
+ || first_element->group_number == -1)
+ {
+ if (merge_first_element == NULL)
+ {
+ visited[k] = true;
+ res.safe_push (first_element);
+ return;
+ }
+ }
+ if (first_element->group_number != -1
+ && merge_first_element == NULL)
+ {
+ merge_first_element = first_element;
+ }
+ if (merge_first_element->group_number == first_element->group_number)
+ {
+ visited[k] = true;
+ res.safe_push (first_element);
+ }
+ }
+}
+
+/* Transpose and merge the grouped stores that are found from
+ vect_slp_grouped_store_find (). */
+
+static stmt_vec_info
+vect_slp_grouped_store_transform (vec<stmt_vec_info> res)
+{
+ stmt_vec_info stmt_info = res[0];
+ if (res.length () == 1)
+ {
+ return stmt_info;
+ }
+ stmt_vec_info rearrange_first_element = stmt_info;
+ stmt_vec_info last_element = rearrange_first_element;
+
+ unsigned int size = DR_GROUP_SIZE (rearrange_first_element);
+ unsigned int new_group_size = size * res.length ();
+ for (unsigned int i = 1; i < res.length (); i++)
+ {
+ /* Store the gap from the previous member of the group. If there is no
+ gap in the access, DR_GROUP_GAP is always 1. */
+ DR_GROUP_GAP_TRANS (res[i]) = DR_GROUP_GAP (res[i]);
+ DR_GROUP_GAP (res[i]) = 1;
+ }
+ while (!res.is_empty ())
+ {
+ stmt_info = res[0];
+ res.ordered_remove (0);
+ if (DR_GROUP_NEXT_ELEMENT (stmt_info))
+ {
+ res.safe_push (DR_GROUP_NEXT_ELEMENT (stmt_info));
+ }
+ DR_GROUP_FIRST_ELEMENT (stmt_info) = rearrange_first_element;
+ DR_GROUP_NEXT_ELEMENT (last_element) = stmt_info;
+ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info);
+ DR_GROUP_SIZE (stmt_info) = new_group_size;
+ last_element = stmt_info;
+ }
+
+ DR_GROUP_SIZE (rearrange_first_element) = new_group_size;
+ DR_GROUP_SLP_TRANSPOSE (rearrange_first_element) = true;
+ DR_GROUP_NEXT_ELEMENT (last_element) = NULL;
+ return rearrange_first_element;
+}
+
+/* Save the STMT_INFOs of the grouped stores to BB_VINFO_SCALAR_STORES so
+ that the grouped stores can be transposed back later. */
+
+static void
+get_scalar_stores (bb_vec_info bb_vinfo)
+{
+ unsigned int k = 0;
+ stmt_vec_info first_element = NULL;
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
+ {
+ /* Skip grouped stores that are unnecessary for transposing. */
+ if (!STMT_VINFO_GROUPED_ACCESS (first_element)
+ || first_element->group_number == -1)
+ {
+ continue;
+ }
+ vec<stmt_vec_info> tmp_scalar_store;
+ tmp_scalar_store.create (DR_GROUP_SIZE (first_element));
+ for (stmt_vec_info stmt_info = first_element; stmt_info;
+ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
+ {
+ tmp_scalar_store.safe_push (stmt_info);
+ }
+ BB_VINFO_SCALAR_STORES (bb_vinfo).safe_push (tmp_scalar_store);
+ }
+}
+
+/* Transpose and merge the grouped stores which have the same group number.
+ For example, for grouped stores (opa_0, opa_1, opa_2, opa_3):
+ opa_0: a00->a01->a02->a03
+ opa_1: a10->a11->a12->a13
+ opa_2: a20->a21->a22->a23
+ opa_3: a30->a31->a32->a33
+ we can probably get the merged grouped store:
+ opa: a00->a10->a20->a30
+ ->a01->a11->a21->a31
+ ->a02->a12->a22->a32
+ ->a03->a13->a23->a33. */
+
+static bool
+vect_transform_slp_grouped_stores (bb_vec_info bb_vinfo)
+{
+ if (bb_vinfo->grouped_stores.length () <= 0)
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "The number of grouped stores is 0.\n");
+ }
+ return false;
+ }
+
+ bb_vinfo->grouped_stores.qsort (dr_group_cmp);
+ auto_vec<stmt_vec_info> grouped_stores_merge;
+ auto_vec<bool> visited (bb_vinfo->grouped_stores.length ());
+ unsigned int i = 0;
+ for (i = 0; i < bb_vinfo->grouped_stores.length (); i++)
+ {
+ visited.safe_push (false);
+ }
+
+ /* Get scalar stores for the following transposition recovery. */
+ get_scalar_stores (bb_vinfo);
+
+ while (1)
+ {
+ /* Find grouped stores which are required to transpose and merge. */
+ auto_vec<stmt_vec_info> res;
+ vect_slp_grouped_store_find (bb_vinfo, visited, res);
+ if (res.is_empty ())
+ {
+ break;
+ }
+ /* Transpose and merge the required grouped stores into one group. */
+ grouped_stores_merge.safe_push (vect_slp_grouped_store_transform (res));
+ }
+
+ BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
+ for (i = 0; i < grouped_stores_merge.length (); i++)
+ {
+ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_merge[i]);
+ }
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Transposing grouped stores successfully.\n");
+ }
+ return true;
+}
+
+/* A helper for vect_transform_back_slp_grouped_stores (). */
+
+static auto_vec<stmt_vec_info>
+vect_transform_back_slp_grouped_store (bb_vec_info bb_vinfo,
+ stmt_vec_info first_stmt_info)
+{
+ auto_vec<stmt_vec_info> grouped_stores_split;
+ for (unsigned int i = 0; i < bb_vinfo->scalar_stores.length (); i++)
+ {
+ vec<stmt_vec_info> scalar_tmp = bb_vinfo->scalar_stores[i];
+ if (scalar_tmp.length () > 1
+ && scalar_tmp[0]->group_number != first_stmt_info->group_number)
+ {
+ continue;
+ }
+ stmt_vec_info cur_stmt_info = NULL;
+ stmt_vec_info cur_first_stmt_info = NULL;
+ stmt_vec_info last_stmt_info = NULL;
+ unsigned int k = 0;
+ FOR_EACH_VEC_ELT (scalar_tmp, k, cur_stmt_info)
+ {
+ if (k == 0)
+ {
+ cur_first_stmt_info = cur_stmt_info;
+ last_stmt_info = cur_stmt_info;
+ }
+ DR_GROUP_FIRST_ELEMENT (cur_stmt_info) = cur_first_stmt_info;
+ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = cur_stmt_info;
+ last_stmt_info = cur_stmt_info;
+ }
+ DR_GROUP_SIZE (cur_first_stmt_info) = k;
+ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = NULL;
+ if (first_stmt_info != cur_first_stmt_info)
+ {
+ DR_GROUP_GAP (cur_first_stmt_info)
+ = DR_GROUP_GAP_TRANS (cur_first_stmt_info);
+ DR_GROUP_SLP_TRANSPOSE (cur_first_stmt_info) = false;
+ DR_GROUP_NUMBER (cur_first_stmt_info) = -1;
+ }
+ grouped_stores_split.safe_push (cur_first_stmt_info);
+ }
+ return grouped_stores_split;
+}
+
+/* Transform the grouped store back. */
+
+void
+vect_transform_back_slp_grouped_stores (bb_vec_info bb_vinfo,
+ stmt_vec_info first_stmt_info)
+{
+ if (first_stmt_info->group_number == -1)
+ {
+ return;
+ }
+ /* Transform back. */
+ auto_vec<stmt_vec_info> grouped_stores_split
+ = vect_transform_back_slp_grouped_store (bb_vinfo, first_stmt_info);
+
+ /* Add the remaining grouped stores to grouped_stores_split. */
+ stmt_vec_info first_element = NULL;
+ unsigned int i = 0;
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
+ {
+ if (first_element->group_number != first_stmt_info->group_number)
+ {
+ grouped_stores_split.safe_push (first_element);
+ }
+ }
+ DR_GROUP_SLP_TRANSPOSE (first_stmt_info) = false;
+ DR_GROUP_NUMBER (first_stmt_info) = -1;
+ BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
+ for (i = 0; i < grouped_stores_split.length (); i++)
+ {
+ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_split[i]);
+ }
+}
+
+/* Function check_for_slp_vectype
+
+ Restrict grouped stores by checking their vectype.
+ If the vectype of a grouped store has changed, it needs to be transformed
+ back. If all grouped stores need to be transformed back, return FALSE. */
+
+static bool
+check_for_slp_vectype (bb_vec_info bb_vinfo)
+{
+ stmt_vec_info first_element = NULL;
+ unsigned int i = 0;
+ int count = 0;
+ auto_vec<stmt_vec_info> grouped_stores_check;
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
+ {
+ grouped_stores_check.safe_push (first_element);
+ }
+ FOR_EACH_VEC_ELT (grouped_stores_check, i, first_element)
+ {
+ if (STMT_VINFO_GROUPED_ACCESS (first_element)
+ && first_element->group_number != -1)
+ {
+ unsigned int group_size_b
+ = DR_GROUP_SIZE_TRANS (first_element);
+ tree vectype = STMT_VINFO_VECTYPE (first_element);
+ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ if (nunits.to_constant () > group_size_b)
+ {
+ count++;
+ /* If the vectype is changed, this grouped store needs
+ to be transformed back. */
+ vect_transform_back_slp_grouped_stores (bb_vinfo, first_element);
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Not supported: only supported when"
+ " group_size >= nunits.\n");
+ }
+ }
+ }
+ }
+ if (count == BB_VINFO_TRANS_GROUPS (bb_vinfo))
+ {
+ return false;
+ }
+ return true;
+}
+
+/* Function check_for_dr_alignment
+
+ Check the alignment of the slp instance loads.
+ Return FALSE if a load cannot be vectorized. */
+
+static bool
+check_for_dr_alignment (slp_instance instance)
+{
+ slp_tree node = NULL;
+ unsigned int i = 0;
+ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
+ {
+ stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+ dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
+ enum dr_alignment_support supportable_dr_alignment
+ = vect_supportable_dr_alignment (first_dr_info, false);
+ if (supportable_dr_alignment == dr_explicit_realign_optimized
+ || supportable_dr_alignment == dr_explicit_realign)
+ {
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Initialize slp_transpose flag before transposing. */
+
+static void
+init_stmt_info_slp_transpose (bb_vec_info bb_vinfo)
+{
+ stmt_vec_info first_element = NULL;
+ unsigned int k = 0;
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
+ {
+ if (STMT_VINFO_GROUPED_ACCESS (first_element))
+ {
+ DR_GROUP_SLP_TRANSPOSE (first_element) = false;
+ }
+ }
+ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, k, first_element)
+ {
+ if (STMT_VINFO_GROUPED_ACCESS (first_element))
+ {
+ DR_GROUP_SLP_TRANSPOSE (first_element) = false;
+ }
+ }
+}
+
+/* Analyze and transpose the stmts before building the SLP tree. */
+
+static bool
+vect_analyze_transpose (bb_vec_info bb_vinfo)
+{
+ DUMP_VECT_SCOPE ("vect_analyze_transpose");
+
+ if (!vect_may_transpose (bb_vinfo))
+ {
+ return false;
+ }
+
+ /* For basic block SLP, try to merge the grouped stores and loads
+ into one group. */
+ init_stmt_info_slp_transpose (bb_vinfo);
+ if (vect_transform_slp_grouped_stores (bb_vinfo)
+ && vect_merge_slp_grouped_loads (bb_vinfo))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Analysis succeeded with SLP transposed.\n");
+ }
+ return true;
+ }
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Analysis failed with SLP transposed.\n");
+ }
+ return false;
+}
/* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
trees of packed scalar stmts if SLP is possible. */
@@ -3124,7 +4018,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
- if (dump_enabled_p ())
+ BB_VINFO_VEC_INSIDE_COST (bb_vinfo) = vec_inside_cost;
+ BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo) = vec_outside_cost;
+ BB_VINFO_SCALAR_COST (bb_vinfo) = scalar_cost;
+
+ if (!unlimited_cost_model (NULL) && dump_enabled_p ())
{
dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
@@ -3239,6 +4137,22 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
vect_pattern_recog (bb_vinfo);
+ /* Transpose grouped stores and loads to get a better vectorizable version. */
+ if (bb_vinfo->transposed)
+ {
+ if (!vect_analyze_transpose (bb_vinfo))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: unhandled SLP transpose in "
+ "basic block.\n");
+ }
+ return false;
+ }
+ }
+ bb_vinfo->before_slp = true;
+
/* Check the SLP opportunities in the basic block, analyze and build SLP
trees. */
if (!vect_analyze_slp (bb_vinfo, n_stmts))
@@ -3254,6 +4168,20 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
return false;
}
+ /* Check if the vectype is suitable for transposed SLP. */
+ if (bb_vinfo->transposed && !check_for_slp_vectype (bb_vinfo))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Failed to transpose SLP in the basic block.\n");
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: vectype is not suitable for "
+ "SLP transpose in basic block.\n");
+ }
+ return false;
+ }
+
vect_record_base_alignments (bb_vinfo);
/* Analyze and verify the alignment of data references and the
@@ -3286,6 +4214,27 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
return false;
+ /* Check if the alignment is suitable for transposed SLP. */
+ if (bb_vinfo->transposed)
+ {
+ for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); i++)
+ {
+ if (!check_for_dr_alignment (instance))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Failed to transpose SLP in the basic "
+ "block.\n");
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: alignment is not suitable "
+ "for SLP transpose in basic block.\n");
+ }
+ return false;
+ }
+ }
+ }
+
if (!vect_slp_analyze_operations (bb_vinfo))
{
if (dump_enabled_p ())
@@ -3311,6 +4260,83 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
return true;
}
+static bool
+may_new_transpose_bbvinfo (bb_vec_info bb_vinfo_ori, bool res_ori)
+{
+ /* If the flag is false or the slp analysis is broken before
+ vect_analyze_slp, we don't try to analyze the transposed SLP version. */
+ if (!flag_tree_slp_transpose_vectorize
+ || !BB_VINFO_BEFORE_SLP (bb_vinfo_ori))
+ {
+ return false;
+ }
+
+ /* If the original bb_vinfo can't be vectorized, try to create a bb_vinfo
+ for the transposed version. */
+ if (!res_ori)
+ {
+ return true;
+ }
+
+ /* Calculate the cost of the original bb_vinfo. */
+ if (unlimited_cost_model (NULL))
+ {
+ vect_bb_vectorization_profitable_p (bb_vinfo_ori);
+ }
+ /* If the vector cost and scalar cost do not differ much (the threshold
+ here is 4), we try to create a bb_vinfo for the transposed version. */
+ if (BB_VINFO_SCALAR_COST (bb_vinfo_ori)
+ < 4 * (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
+ + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori)))
+ {
+ return true;
+ }
+ return false;
+}
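+
+/* A worked example of the threshold above, with hypothetical costs: if
+   scalar_cost = 12, vec_inside_cost = 2 and vec_outside_cost = 2, then
+   12 < 4 * (2 + 2) holds, so the vectorized gain is modest and the
+   transposed version is also analyzed; with scalar_cost = 40 the original
+   version is already a clear win and the transposed analysis is skipped. */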
+
+static bool
+may_choose_transpose_bbvinfo (bb_vec_info bb_vinfo_trans, bool res_trans,
+ bb_vec_info bb_vinfo_ori, bool res_ori)
+{
+ /* The original bb_vinfo is chosen if the transposed bb_vinfo
+ can't be vectorized. */
+ if (!res_trans)
+ {
+ return false;
+ }
+ /* Calculate the cost of the transposed bb_vinfo. */
+ if (unlimited_cost_model (NULL))
+ {
+ vect_bb_vectorization_profitable_p (bb_vinfo_trans);
+ }
+ int diff_bb_cost = -1;
+ int diff_bb_cost_trans = -1;
+ if (res_ori)
+ {
+ diff_bb_cost = BB_VINFO_SCALAR_COST (bb_vinfo_ori)
+ - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
+ - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori);
+ }
+ if (res_trans)
+ {
+ diff_bb_cost_trans = BB_VINFO_SCALAR_COST (bb_vinfo_trans)
+ - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
+ - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans);
+ }
+ /* The original bb_vinfo is chosen when one of the following conditions
+ holds:
+ 1) The cost of the original version is better than that of the
+ transposed version.
+ 2) The vector cost is close to the scalar cost in the transposed
+ version. */
+ if ((res_ori && res_trans && diff_bb_cost >= diff_bb_cost_trans)
+ || (res_trans && BB_VINFO_SCALAR_COST (bb_vinfo_trans)
+ <= (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
+ + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans))))
+ {
+ return false;
+ }
+ return true;
+}
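+
+/* A worked example with hypothetical costs: if the original version saves
+   diff_bb_cost = 6, the transposed version saves diff_bb_cost_trans = 10,
+   and the transposed scalar cost (say 20) is above its total vector cost
+   (say 10), neither condition above holds and the transposed bb_vinfo is
+   chosen. */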
+
/* Subroutine of vect_slp_bb. Try to vectorize the statements between
REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
on success. The region has N_STMTS statements and has the datarefs
@@ -3323,6 +4349,7 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin,
unsigned int n_stmts)
{
bb_vec_info bb_vinfo;
+ bb_vec_info bb_vinfo_trans = NULL;
auto_vector_modes vector_modes;
/* Autodetect first vector size we try. */
@@ -3337,6 +4364,10 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin,
{
bool vectorized = false;
bool fatal = false;
+ bool res_bb_vinfo_ori = false;
+ bool res_bb_vinfo_trans = false;
+
+ /* Create a bb_vinfo for the original version. */
bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
bool first_time_p = shared.datarefs.is_empty ();
@@ -3346,8 +4377,57 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin,
else
bb_vinfo->shared->check_datarefs ();
bb_vinfo->vector_mode = next_vector_mode;
+ bb_vinfo->transposed = false;
+ bb_vinfo->before_slp = false;
+
+ res_bb_vinfo_ori = vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal);
+ /* Analyze and create a transposed bb_vinfo. */
+ if (may_new_transpose_bbvinfo (bb_vinfo, res_bb_vinfo_ori))
+ {
+ bool fatal_trans = false;
+ bb_vinfo_trans
+ = new _bb_vec_info (region_begin, region_end, &shared);
+ bool first_time_p = shared.datarefs.is_empty ();
+ BB_VINFO_DATAREFS (bb_vinfo_trans) = datarefs;
+ if (first_time_p)
+ {
+ bb_vinfo_trans->shared->save_datarefs ();
+ }
+ else
+ {
+ bb_vinfo_trans->shared->check_datarefs ();
+ }
+ bb_vinfo_trans->vector_mode = next_vector_mode;
+ bb_vinfo_trans->transposed = true;
+ bb_vinfo_trans->before_slp = false;
+
+ res_bb_vinfo_trans
+ = vect_slp_analyze_bb_1 (bb_vinfo_trans, n_stmts, fatal_trans);
+ if (may_choose_transpose_bbvinfo (bb_vinfo_trans,
+ res_bb_vinfo_trans,
+ bb_vinfo, res_bb_vinfo_ori))
+ {
+ bb_vinfo = bb_vinfo_trans;
+ fatal = fatal_trans;
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Basic block part vectorized "
+ "using transposed version.\n");
+ }
+ }
+ else
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Basic block part vectorized "
+ "using original version.\n");
+ }
+ }
+ }
- if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
+ if ((res_bb_vinfo_ori || res_bb_vinfo_trans)
&& dbg_cnt (vect_slp))
{
if (dump_enabled_p ())
@@ -3400,6 +4480,10 @@ vect_slp_bb_region (gimple_stmt_iterator region_begin,
}
delete bb_vinfo;
+ if (bb_vinfo_trans)
+ {
+ bb_vinfo_trans = NULL;
+ }
if (mode_i < vector_modes.length ()
&& VECTOR_MODE_P (autodetected_vector_mode)
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 6418edb52..b872cfc8d 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -7329,6 +7329,153 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
return true;
}
+/* Function vect_permute_store_chains
+
+ Call function vect_permute_store_chain ().
+ Given a chain of interleaved stores in DR_CHAIN, generate
+ interleave_high/low stmts to reorder the data correctly.
+ Return the final references for stores in RESULT_CHAIN. */
+
+static void
+vect_permute_store_chains (vec<tree> dr_chain, unsigned int num_each,
+ stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+ vec<tree> *result_chain, unsigned int group)
+{
+ unsigned int k = 0;
+ unsigned int t = 0;
+
+ /* Divide the vectors into GROUP parts, and permute every NUM_EACH vectors
+ together. */
+ for (k = 0; k < group; k++)
+ {
+ auto_vec<tree> dr_chain_transposed (num_each);
+ auto_vec<tree> result_chain_transposed (num_each);
+ for (t = k; t < dr_chain.length (); t = t + group)
+ {
+ dr_chain_transposed.quick_push (dr_chain[t]);
+ }
+ vect_permute_store_chain (dr_chain_transposed, num_each, stmt_info,
+ gsi, &result_chain_transposed);
+ for (t = 0; t < num_each; t++)
+ {
+ result_chain->quick_push (result_chain_transposed[t]);
+ }
+ }
+}
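+
+/* For example, with hypothetical sizes: given DR_CHAIN = {v0, ..., v7},
+   GROUP = 2 and NUM_EACH = 4, the sub-chains {v0, v2, v4, v6} and
+   {v1, v3, v5, v7} are each permuted by vect_permute_store_chain () and
+   their results are appended to RESULT_CHAIN in that order. */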
+
+/* Function transpose_oprnd_store
+
+ Calculate the transposed results from VEC_OPRNDS (VEC_STMT)
+ for vectorizable_store. */
+
+static void
+transpose_oprnd_store (vec<tree> vec_oprnds, vec<tree> *result_chain,
+ unsigned int vec_num, unsigned int const_nunits,
+ unsigned int array_num, stmt_vec_info first_stmt_info,
+ gimple_stmt_iterator *gsi)
+{
+ unsigned int group_for_transform = 0;
+ unsigned int num_each = 0;
+
+ /* Transpose back for vec_oprnds. */
+ /* vec = {vec1, vec2, ...} */
+ if (array_num < const_nunits
+ && const_nunits % array_num == 0)
+ {
+ vect_transpose_store_chain (vec_oprnds,
+ vec_num, array_num,
+ first_stmt_info,
+ gsi, result_chain);
+ }
+ /* vec1 = {vec_part1}, vec2 = {vec_part2}, ... */
+ else if (array_num >= const_nunits
+ && array_num % const_nunits == 0)
+ {
+ group_for_transform = array_num / const_nunits;
+ num_each = vec_oprnds.length () / group_for_transform;
+ vect_permute_store_chains (vec_oprnds,
+ num_each, first_stmt_info,
+ gsi, result_chain,
+ group_for_transform);
+ }
+ else
+ {
+ gcc_unreachable ();
+ }
+}
+
+static dr_vec_info *
+get_dr_info (stmt_vec_info stmt_info)
+{
+ dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+ if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
+ {
+ SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
+ }
+ return dr_info;
+}
+
+static unsigned
+dr_align_vect_store (dr_vec_info *cur_first_dr_info,
+ unsigned HOST_WIDE_INT &align)
+{
+ unsigned misalign = 0;
+ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info));
+ if (aligned_access_p (cur_first_dr_info))
+ {
+ return misalign;
+ }
+ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1)
+ {
+ align = dr_alignment (vect_dr_behavior (cur_first_dr_info));
+ }
+ else
+ {
+ misalign = DR_MISALIGNMENT (cur_first_dr_info);
+ }
+ return misalign;
+}
+
+static stmt_vec_info
+add_new_stmt_vect_store (tree vectype, tree dataref_ptr, tree dataref_offset,
+ tree ref_type, dr_vec_info *cur_first_dr_info,
+ tree vec_oprnd, gimple_stmt_iterator *gsi,
+ stmt_vec_info stmt_info)
+{
+ /* Data align. */
+ unsigned HOST_WIDE_INT align;
+ unsigned misalign = dr_align_vect_store (cur_first_dr_info, align);
+
+ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME)
+ {
+ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign);
+ }
+
+ /* Get data_ref. */
+ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0);
+ tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, offset);
+ if (aligned_access_p (cur_first_dr_info))
+ {
+ ;
+ }
+ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1)
+ {
+ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
+ align * BITS_PER_UNIT);
+ }
+ else
+ {
+ tree elem_type = TREE_TYPE (vectype);
+ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
+ TYPE_ALIGN (elem_type));
+ }
+ /* Add new stmt. */
+ vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
+ gassign *new_stmt = gimple_build_assign (data_ref, vec_oprnd);
+ stmt_vec_info new_stmt_info
+ = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+ return new_stmt_info;
+}
/* Function vectorizable_store.
@@ -8208,6 +8355,16 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
vect_get_gather_scatter_ops (loop, stmt_info, &gs_info,
&dataref_ptr, &vec_offset);
+ /* If the stmt_info needs transposed recovery, dataref_ptr
+ will be calculated later. */
+ else if (memory_access_type == VMAT_CONTIGUOUS
+ && is_a <bb_vec_info> (vinfo)
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SLP_TRANSPOSE (
+ DR_GROUP_FIRST_ELEMENT (stmt_info)))
+ {
+ dataref_ptr = NULL_TREE;
+ }
else
dataref_ptr
= vect_create_data_ref_ptr (first_stmt_info, aggr_type,
@@ -8299,6 +8456,75 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
}
else
{
+ /* group_size: the size of the group after transposing and merging.
+ group_size_b: the size of the group before transposing and merging;
+ only group_size_b >= const_nunits is supported.
+ array_num: the number of arrays.
+ const_nunits: TYPE_VECTOR_SUBPARTS (vectype).
+ ncontinues: group_size_b / const_nunits, i.e. the number of
+ vector stores needed per array. */
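+ /* A hypothetical example: group_size = 16, group_size_b = 8 and
+ const_nunits = 4 give array_num = 2 and ncontinues = 2, i.e. two
+ arrays, each written back with two vector stores. */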
+ if (slp && is_a <bb_vec_info> (vinfo)
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vectorizable_store for slp transpose.\n");
+ }
+ /* Transpose back for grouped stores. */
+ vect_transform_back_slp_grouped_stores (bb_vinfo,
+ first_stmt_info);
+
+ result_chain.create (vec_oprnds.length ());
+ unsigned int const_nunits = nunits.to_constant ();
+ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
+ unsigned int array_num = group_size / group_size_b;
+ transpose_oprnd_store (vec_oprnds, &result_chain, vec_num,
+ const_nunits, array_num,
+ first_stmt_info, gsi);
+
+ /* Iterate over every store group, not over every vector, because
+ transposing and merging have changed the data reference accesses. */
+ gcc_assert (group_size_b >= const_nunits);
+ unsigned int ncontinues = group_size_b / const_nunits;
+
+ unsigned int k = 0;
+ for (i = 0; i < array_num; i++)
+ {
+ stmt_vec_info first_stmt_b;
+ BB_VINFO_GROUPED_STORES (vinfo).iterate (i, &first_stmt_b);
+ bool simd_lane_access_p
+ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_b) != 0;
+ tree ref_type = get_group_alias_ptr_type (first_stmt_b);
+ dataref_ptr = vect_create_data_ref_ptr (
+ first_stmt_b, aggr_type,
+ simd_lane_access_p ? loop : NULL,
+ offset, &dummy, gsi, &ptr_incr,
+ simd_lane_access_p, NULL_TREE, bump);
+ dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_b);
+ for (unsigned int t = 0; t < ncontinues; t++)
+ {
+ vec_oprnd = result_chain[k];
+ k++;
+ if (t > 0)
+ {
+ /* Bump the vector pointer. */
+ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr,
+ gsi, first_stmt_b,
+ bump);
+ }
+ new_stmt_info = add_new_stmt_vect_store (
+ vectype, dataref_ptr, dataref_offset,
+ ref_type, cur_first_dr_info, vec_oprnd,
+ gsi, first_stmt_b);
+ }
+ }
+ oprnds.release ();
+ result_chain.release ();
+ vec_oprnds.release ();
+ return true;
+ }
new_stmt_info = NULL;
if (grouped_store)
{
@@ -8557,6 +8783,447 @@ hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop)
return true;
}
+static tree
+calculate_new_type (tree vectype, unsigned int const_nunits,
+ unsigned int group_size_b, unsigned int &nloads,
+ unsigned int &ncontinues, tree &lvectype)
+{
+ tree ltype = TREE_TYPE (vectype);
+ /* nloads is the number of ARRAYs in a vector.
+ vectemp = {a[], b[], ...} */
+ if (group_size_b < const_nunits)
+ {
+ tree ptype;
+ tree vtype
+ = vector_vector_composition_type (vectype,
+ const_nunits / group_size_b,
+ &ptype);
+ if (vtype != NULL_TREE)
+ {
+ nloads = const_nunits / group_size_b;
+ lvectype = vtype;
+ ltype = ptype;
+ ncontinues = 1;
+ }
+ }
+ /* ncontinues is the number of vectors from an ARRAY.
+ vectemp1 = {a[0], a[1], ...}
+ ...
+ vectempm = {a[k], a[k+1], ...} */
+ else
+ {
+ nloads = 1;
+ ltype = vectype;
+ ncontinues = group_size_b / const_nunits;
+ }
+ ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
+ return ltype;
+}
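+
+/* For example, with hypothetical sizes: const_nunits = 8 and
+   group_size_b = 2 give nloads = 4 and ncontinues = 1, so one vector is
+   composed of four two-element arrays (provided
+   vector_vector_composition_type () finds a suitable composition type);
+   const_nunits = 4 and group_size_b = 8 give nloads = 1 and ncontinues = 2,
+   so each array is loaded with two full vectors. */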
+
+static void
+generate_old_load_permutations (slp_tree slp_node, unsigned int group_size,
+ vec<unsigned> &old_load_permutation)
+{
+ /* Generate the old load permutations from the slp_node. */
+ unsigned i = 0;
+ unsigned k = 0;
+
+ /* If SLP_NODE has load_permutation, we copy it to old_load_permutation.
+ Otherwise, we generate a permutation sequentially. */
+ if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+ {
+ FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), i, k)
+ {
+ old_load_permutation.safe_push (k);
+ }
+ }
+ else
+ {
+ for (unsigned i = 0; i < group_size; i++)
+ {
+ old_load_permutation.safe_push (i);
+ }
+ }
+}
+
+static void
+generate_new_load_permutation_mapping (unsigned slp_node_length,
+ vec<unsigned> &group_idx,
+ const vec<unsigned> &load_permutation,
+ unsigned int group_size_b,
+ unsigned &new_group_size,
+ vec<unsigned> &group_from)
+{
+ /* group_num_vec: only stores the group_loads IDs which are calculated from
+ load_permutation. */
+ auto_vec<unsigned> group_num_vec;
+
+ /* Calculate which group_loads the stmts in SLP_NODE come from. */
+ unsigned i = 0;
+ unsigned k = 0;
+ FOR_EACH_VEC_ELT (load_permutation, i, k)
+ {
+ unsigned int t0 = k / group_size_b;
+ if (!group_num_vec.contains (t0))
+ {
+ group_num_vec.safe_push (t0);
+ }
+ group_from.safe_push (t0);
+ }
+ group_num_vec.qsort (cmp_for_group_num);
+ /* n_groups: the number of group_loads. */
+ unsigned int n_groups = group_num_vec.length ();
+ new_group_size = n_groups * group_size_b;
+ for (i = 0; i < n_groups; i++)
+ {
+ group_idx.safe_push (group_num_vec[i] * group_size_b);
+ }
+ /* A new mapping from group_ind_vec to group_from.
+ For example:
+ Origin: group_from = {1,1,3,3,5,5,7,7};
+ After mapping: group_from = {0,0,1,1,2,2,2,2}; */
+ auto_vec<unsigned> group_ind_vec (n_groups);
+ for (k = 0; k < n_groups; k++)
+ {
+ group_ind_vec.safe_push (k);
+ }
+ for (i = 0; i < slp_node_length; i++)
+ {
+ for (k = 0; k < n_groups; k++)
+ {
+ if (group_from[i] == group_num_vec[k])
+ {
+ group_from[i] = group_ind_vec[k];
+ break;
+ }
+ }
+ }
+}
+
+static void
+generate_new_load_permutation (vec<unsigned> &new_load_permutation,
+ const vec<unsigned> &old_load_permutation,
+ slp_tree slp_node, bool &this_load_permuted,
+ const vec<unsigned> &group_from,
+ unsigned int group_size_b)
+{
+ unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ /* Generate the new load permutation from the new mapping. */
+ new_load_permutation.create (slp_node_length);
+ unsigned i = 0;
+ unsigned k = 0;
+ FOR_EACH_VEC_ELT (old_load_permutation, i, k)
+ {
+ /* t1 is the new permutation of k in the old permutation.
+ t1 = base_address + offset:
+ base_address = group_from[i] * group_size_b;
+ offset = k % group_size_b. */
+ unsigned int t1
+ = group_from[i] * group_size_b + k % group_size_b;
+ new_load_permutation.safe_push (t1);
+ if (t1 != k)
+ {
+ this_load_permuted = true;
+ }
+ }
+}
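+
+/* A worked example with hypothetical values: for group_size_b = 4 and an
+   old load permutation {4, 5, 12, 13}, the referenced group_loads are 1 and
+   3, which are remapped to 0 and 1, so group_from = {0, 0, 1, 1} and the
+   new load permutation becomes {0, 1, 4, 5}; since the entries changed,
+   this_load_permuted is set to true. */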
+
+static bool
+is_slp_perm (bool slp_perm, bool this_load_permuted, poly_uint64 nunits,
+ unsigned int group_size, stmt_vec_info first_stmt_info)
+{
+ /* Calculate the unrolling factor based on the smallest type. */
+ poly_uint64 unrolling_factor
+ = exact_div (common_multiple (nunits, group_size), group_size);
+ /* The load requires permutation when unrolling exposes
+ a gap either because the group is larger than the SLP
+ group-size or because there is a gap between the groups. */
+ if (!slp_perm && !this_load_permuted
+ && (known_eq (unrolling_factor, 1U)
+ || (group_size == DR_GROUP_SIZE (first_stmt_info)
+ && DR_GROUP_GAP (first_stmt_info) == 0)))
+ {
+ return false;
+ }
+ else
+ {
+ return true;
+ }
+}
+
+static void
+generate_load_permutation (slp_tree slp_node, unsigned &new_group_size,
+ unsigned int group_size, unsigned int group_size_b,
+ bool &this_load_permuted, vec<unsigned> &group_idx,
+ vec<unsigned> &new_load_permutation)
+{
+ /* Generate the old load permutations from SLP_NODE. */
+ vec<unsigned> old_load_permutation;
+ old_load_permutation.create (group_size);
+ generate_old_load_permutations (slp_node, group_size, old_load_permutation);
+
+ /* Calculate which group_loads the stmts in SLP_NODE come from. */
+ unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ /* group_from: stores the group_loads ID for every stmt in SLP_NODE. */
+ vec<unsigned> group_from;
+ group_from.create (slp_node_length);
+ generate_new_load_permutation_mapping (slp_node_length, group_idx,
+ old_load_permutation,
+ group_size_b, new_group_size,
+ group_from);
+
+ /* Generate the new load permutation from the new mapping and calculate
+ the this_load_permuted flag. If this_load_permuted is true, we need to
+ execute the SLP permutation using the new load permutation. */
+ generate_new_load_permutation (new_load_permutation, old_load_permutation,
+ slp_node, this_load_permuted, group_from,
+ group_size_b);
+ old_load_permutation.release ();
+ group_from.release ();
+}
+
+static unsigned int
+dr_align_vect_load (dr_vec_info *cur_first_dr_info,
+ unsigned HOST_WIDE_INT &align,
+ enum dr_alignment_support alignment_support_scheme)
+{
+ unsigned int misalign = 0;
+
+ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info));
+ if (alignment_support_scheme == dr_aligned)
+ {
+ gcc_assert (aligned_access_p (cur_first_dr_info));
+ }
+ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1)
+ {
+ align = dr_alignment (vect_dr_behavior (cur_first_dr_info));
+ }
+ else
+ {
+ misalign = DR_MISALIGNMENT (cur_first_dr_info);
+ }
+ return misalign;
+}
+
+static stmt_vec_info
+add_new_stmt_vect_load (tree vectype, tree dataref_ptr, tree dataref_offset,
+ tree ref_type, tree ltype, gassign *(&new_stmt),
+ dr_vec_info *cur_first_dr_info,
+ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info)
+{
+ /* Data align. */
+ enum dr_alignment_support alignment_support_scheme
+ = vect_supportable_dr_alignment (cur_first_dr_info, false);
+ unsigned HOST_WIDE_INT align;
+ unsigned int misalign = dr_align_vect_load (cur_first_dr_info, align,
+ alignment_support_scheme);
+ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME)
+ {
+ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign);
+ }
+
+ /* Get data_ref. */
+ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0);
+ tree data_ref = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
+ if (alignment_support_scheme == dr_aligned)
+ {
+ ;
+ }
+ else if (DR_MISALIGNMENT (cur_first_dr_info) == -1)
+ {
+ TREE_TYPE (data_ref)
+ = build_aligned_type (TREE_TYPE (data_ref), align * BITS_PER_UNIT);
+ }
+ else
+ {
+ tree elem_type = TREE_TYPE (vectype);
+ TREE_TYPE (data_ref)
+ = build_aligned_type (TREE_TYPE (data_ref), TYPE_ALIGN (elem_type));
+ }
+
+ /* Add new stmt. */
+ vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
+ new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref);
+ stmt_vec_info new_stmt_info
+ = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+ return new_stmt_info;
+}
+
+static void
+push_new_stmt_to_dr_chain (bool slp_perm, stmt_vec_info new_stmt_info,
+ vec<tree> &dr_chain, slp_tree slp_node)
+{
+ if (slp_perm)
+ {
+ dr_chain.quick_push (gimple_assign_lhs (new_stmt_info->stmt));
+ }
+ else
+ {
+ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
+ }
+}
+
+static stmt_vec_info
+get_first_stmt_info_before_transpose (stmt_vec_info first_stmt_info,
+ unsigned int group_el,
+ unsigned int group_size)
+{
+ stmt_vec_info last_stmt_info = first_stmt_info;
+ unsigned int count = 0;
+ gcc_assert (group_el < group_size);
+ while (count < group_el)
+ {
+ last_stmt_info = DR_GROUP_NEXT_ELEMENT (last_stmt_info);
+ count++;
+ }
+ return last_stmt_info;
+}
+
+static stmt_vec_info
+add_new_stmt_for_nloads_greater_than_one (tree lvectype, tree vectype,
+ vec<constructor_elt, va_gc> *v,
+ stmt_vec_info stmt_info,
+ gimple_stmt_iterator *gsi)
+{
+ tree vec_inv = build_constructor (lvectype, v);
+ tree new_temp = vect_init_vector (stmt_info, vec_inv, lvectype, gsi);
+ vec_info *vinfo = stmt_info->vinfo;
+ stmt_vec_info new_stmt_info = vinfo->lookup_def (new_temp);
+ if (lvectype != vectype)
+ {
+ gassign *new_stmt = gimple_build_assign (make_ssa_name (vectype),
+ VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ vectype, new_temp));
+ new_stmt_info = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+ }
+ return new_stmt_info;
+}
+
+/* Function new_vect_stmt_for_nloads.
+
+ Create a VEC_STMT when NLOADS arrays are merged into a vector.
+
+ ncopies is the number of vectors that need to be loaded from memory.
+ nloads is the number of ARRAYs in a vector.
+ vectemp = {a[], b[], ...} */
+
+static void
+new_vect_stmt_for_nloads (unsigned int ncopies, unsigned int nloads,
+ vec<unsigned> group_idx, stmt_vec_info stmt_info,
+ offset_info *offset_info, vectype_info *vectype_info,
+ vect_memory_access_type memory_access_type,
+ bool slp_perm, vec<tree>& dr_chain, slp_tree slp_node,
+ gimple_stmt_iterator *gsi)
+{
+ vec<constructor_elt, va_gc> *v = NULL;
+ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
+ stmt_vec_info first_stmt_info_b = NULL;
+ stmt_vec_info new_stmt_info = NULL;
+ tree dataref_ptr = NULL_TREE;
+ tree dummy;
+ gimple *ptr_incr = NULL;
+ unsigned int n = 0;
+ for (unsigned int i = 0; i < ncopies; i++)
+ {
+ vec_alloc (v, nloads);
+ for (unsigned int t = 0; t < nloads; t++)
+ {
+ first_stmt_info_b = get_first_stmt_info_before_transpose (
+ first_stmt_info, group_idx[n++], group_size);
+ dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b);
+ tree bump = vect_get_data_ptr_increment (cur_first_dr_info,
+ vectype_info->ltype,
+ memory_access_type);
+ bool simd_lane_access_p
+ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
+
+ /* Create dataref_ptr, which points to init_address. */
+ dataref_ptr = vect_create_data_ref_ptr (
+ first_stmt_info_b, vectype_info->ltype, NULL,
+ offset_info->offset, &dummy, gsi, &ptr_incr,
+ simd_lane_access_p, offset_info->byte_offset, bump);
+
+ gassign *new_stmt = NULL;
+ new_stmt_info = add_new_stmt_vect_load (
+ vectype_info->vectype, dataref_ptr,
+ offset_info->dataref_offset, vectype_info->ref_type,
+ vectype_info->ltype, new_stmt, cur_first_dr_info,
+ gsi, first_stmt_info_b);
+
+ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_assign_lhs (new_stmt));
+ }
+ new_stmt_info = add_new_stmt_for_nloads_greater_than_one (
+ vectype_info->lvectype, vectype_info->vectype,
+ v, first_stmt_info_b, gsi);
+ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
+ dr_chain, slp_node);
+ }
+}
+
+/* Function new_vect_stmt_for_ncontinues.
+
+ Create VEC_STMTs when an ARRAY is divided into several vectors.
+
+ n_groups is the number of ARRAYs.
+ ncontinues is the number of vectors from an ARRAY.
+ vectemp1 = {a[0], a[1], ...}
+ ...
+ vectempm = {a[k], a[k+1], ...} */
+
+static void
+new_vect_stmt_for_ncontinues (unsigned int ncontinues, vec<unsigned> group_idx,
+ stmt_vec_info stmt_info, offset_info* offset_info,
+ vectype_info* vectype_info,
+ vect_memory_access_type memory_access_type,
+ bool slp_perm, vec<tree>& dr_chain,
+ slp_tree slp_node,
+ gimple_stmt_iterator *gsi)
+{
+ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
+ stmt_vec_info new_stmt_info = NULL;
+ tree dataref_ptr = NULL_TREE;
+ tree dummy;
+ gimple *ptr_incr = NULL;
+ unsigned int n_groups = group_idx.length ();
+ for (unsigned int i = 0; i < n_groups; i++)
+ {
+ stmt_vec_info first_stmt_info_b = get_first_stmt_info_before_transpose (
+ first_stmt_info, group_idx[i], group_size);
+ dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b);
+ tree bump = vect_get_data_ptr_increment (cur_first_dr_info,
+ vectype_info->ltype, memory_access_type);
+ bool simd_lane_access_p
+ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
+ for (unsigned int k = 0; k < ncontinues; k++)
+ {
+ /* Create dataref_ptr, which points to init_address. */
+ if (k == 0)
+ {
+ dataref_ptr = vect_create_data_ref_ptr (
+ first_stmt_info_b, vectype_info->ltype, NULL,
+ offset_info->offset, &dummy, gsi, &ptr_incr,
+ simd_lane_access_p, offset_info->byte_offset, bump);
+ }
+ else
+ {
+ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr,
+ gsi, first_stmt_info_b, bump);
+ }
+ gassign *new_stmt = NULL;
+ new_stmt_info = add_new_stmt_vect_load (
+ vectype_info->vectype, dataref_ptr,
+ offset_info->dataref_offset, vectype_info->ref_type,
+ vectype_info->ltype, new_stmt, cur_first_dr_info,
+ gsi, first_stmt_info_b);
+ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
+ dr_chain, slp_node);
+ }
+ }
+}
+
/* vectorizable_load.
Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure)
@@ -9364,6 +10031,9 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
tree vec_mask = NULL_TREE;
prev_stmt_info = NULL;
poly_uint64 group_elt = 0;
+ unsigned new_group_size = 0;
+ vec<unsigned> new_load_permutation;
+
for (j = 0; j < ncopies; j++)
{
stmt_vec_info new_stmt_info = NULL;
@@ -9385,6 +10055,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
dataref_offset = build_int_cst (ref_type, 0);
}
+ /* If the stmt_info needs transposed recovery, dataref_ptr
+ will be calculated later. */
+ else if (slp && is_a <bb_vec_info> (vinfo)
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SLP_TRANSPOSE (
+ DR_GROUP_FIRST_ELEMENT (stmt_info)))
+ {
+ dataref_ptr = NULL_TREE;
+ }
else if (diff_first_stmt_info)
{
dataref_ptr
@@ -9501,6 +10180,63 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (stmt_info, gsi, vec_array);
}
+ else if (slp && is_a <bb_vec_info> (vinfo)
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vectorizable_load for slp transpose.\n");
+ }
+ /* group_size: the size of the group after merging.
+ group_size_b: the size of the group before merging.
+ const_nunits: TYPE_VECTOR_SUBPARTS (vectype), i.e. the number of
+ elements in a vector.
+ nloads: const_nunits / group_size_b or 1, i.e. the number of
+ ARRAYs in a vector.
+ ncontinues: group_size_b / const_nunits or 1, i.e. the number of
+ vectors from an ARRAY. */
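+ /* A hypothetical example: with group_size_b = 4 and const_nunits = 8,
+ nloads = 2 and ncontinues = 1, so each loaded vector gathers elements
+ from two arrays; with group_size_b = 16 instead, nloads = 1 and
+ ncontinues = 2, so each array is loaded with two vectors. */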
+ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
+ unsigned int const_nunits = nunits.to_constant ();
+ unsigned int nloads = const_nunits;
+ unsigned int ncontinues = group_size_b;
+ tree lvectype = vectype;
+ tree ltype = calculate_new_type (vectype, const_nunits,
+ group_size_b, nloads,
+ ncontinues, lvectype);
+ bool this_load_permuted = false;
+ auto_vec<unsigned> group_idx;
+ generate_load_permutation (slp_node, new_group_size, group_size,
+ group_size_b, this_load_permuted,
+ group_idx, new_load_permutation);
+ slp_perm = is_slp_perm (slp_perm, this_load_permuted, nunits,
+ group_size, first_stmt_info);
+
+ /* ncopies: the number of vectors that need to be loaded from
+ memory. */
+ unsigned int ncopies = new_group_size / const_nunits;
+ offset_info offset_info = {offset, byte_offset, dataref_offset};
+ vectype_info vectype_info = {vectype, ltype, lvectype, ref_type};
+ if (slp_perm)
+ {
+ dr_chain.create (ncopies);
+ }
+ if (nloads > 1 && ncontinues == 1)
+ {
+ new_vect_stmt_for_nloads (ncopies, nloads, group_idx, stmt_info,
+ &offset_info, &vectype_info,
+ memory_access_type, slp_perm, dr_chain,
+ slp_node, gsi);
+ }
+ else
+ {
+ new_vect_stmt_for_ncontinues (ncontinues, group_idx, stmt_info,
+ &offset_info, &vectype_info,
+ memory_access_type, slp_perm,
+ dr_chain, slp_node, gsi);
+ }
+ }
else
{
for (i = 0; i < vec_num; i++)
@@ -9840,7 +10576,32 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (slp && !slp_perm)
continue;
- if (slp_perm)
+ /* Use the new load permutation to generate vector permute statements
+ from the list of loads in DR_CHAIN. */
+ if (slp && slp_perm && is_a <bb_vec_info> (vinfo)
+ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
+ {
+ unsigned n_perms;
+ stmt_vec_info stmt_info_ = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+ unsigned int old_size = DR_GROUP_SIZE (stmt_info);
+ DR_GROUP_SIZE (stmt_info_) = new_group_size;
+ vec<unsigned> old_load_permutation
+ = SLP_TREE_LOAD_PERMUTATION (slp_node);
+ SLP_TREE_LOAD_PERMUTATION (slp_node) = new_load_permutation;
+ bool perm_load_success = vect_transform_slp_perm_load (
+ slp_node, dr_chain, gsi, vf,
+ slp_node_instance, false, &n_perms);
+ DR_GROUP_SIZE (stmt_info_) = old_size;
+ SLP_TREE_LOAD_PERMUTATION (slp_node) = old_load_permutation;
+ new_load_permutation.release ();
+ if (!perm_load_success)
+ {
+ dr_chain.release ();
+ return false;
+ }
+ }
+ else if (slp_perm)
{
unsigned n_perms;
if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f7becb34a..1c4a6c421 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -297,6 +297,21 @@ public:
vec<ddr_p> ddrs;
};
+/* Information about offset in vectorizable_load. */
+struct offset_info {
+ tree offset;
+ tree byte_offset;
+ tree dataref_offset;
+};
+
+/* Information about vectype in vectorizable_load. */
+struct vectype_info {
+ tree vectype;
+ tree ltype;
+ tree lvectype;
+ tree ref_type;
+};
+
/* Vectorizer state common between loop and basic-block vectorization. */
class vec_info {
public:
@@ -335,6 +350,14 @@ public:
stmt in the chain. */
auto_vec<stmt_vec_info> grouped_stores;
+ /* All interleaving chains of loads, represented by the first
+ stmt in the chain. */
+ auto_vec<stmt_vec_info> grouped_loads;
+
+ /* All interleaving chains of stores (before transposing), represented by
+ all stmts in the chain. */
+ auto_vec<vec<stmt_vec_info> > scalar_stores;
+
/* Cost data used by the target cost model. */
void *target_cost_data;
@@ -702,6 +725,8 @@ public:
#define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero
#define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds
#define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores
+#define LOOP_VINFO_GROUPED_LOADS(L) (L)->grouped_loads
+#define LOOP_VINFO_SCALAR_STORES(L) (L)->scalar_stores
#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
#define LOOP_VINFO_REDUCTIONS(L) (L)->reductions
@@ -764,6 +789,25 @@ public:
basic_block bb;
gimple_stmt_iterator region_begin;
gimple_stmt_iterator region_end;
+
+ /* True if bb_vinfo can proceed to vect_analyze_slp. */
+ bool before_slp;
+
+ /* True if bb_vinfo is a transposed version. */
+ bool transposed;
+
+ /* The number of transposed groups. */
+ int transposed_group;
+
+ /* The cost of the scalar iterations. */
+ int scalar_cost;
+
+ /* The cost of the vector prologue and epilogue, including peeled
+ iterations and set-up code. */
+ int vec_outside_cost;
+
+ /* The cost of the vectorized code inside the basic block. */
+ int vec_inside_cost;
} *bb_vec_info;
#define BB_VINFO_BB(B) (B)->bb
@@ -772,6 +816,14 @@ public:
#define BB_VINFO_DATAREFS(B) (B)->shared->datarefs
#define BB_VINFO_DDRS(B) (B)->shared->ddrs
#define BB_VINFO_TARGET_COST_DATA(B) (B)->target_cost_data
+#define BB_VINFO_GROUPED_LOADS(B) (B)->grouped_loads
+#define BB_VINFO_SCALAR_STORES(B) (B)->scalar_stores
+#define BB_VINFO_VEC_OUTSIDE_COST(B) (B)->vec_outside_cost
+#define BB_VINFO_VEC_INSIDE_COST(B) (B)->vec_inside_cost
+#define BB_VINFO_SCALAR_COST(B) (B)->scalar_cost
+#define BB_VINFO_SLP_TRANSPOSED(B) (B)->transposed
+#define BB_VINFO_BEFORE_SLP(B) (B)->before_slp
+#define BB_VINFO_TRANS_GROUPS(B) (B)->transposed_group
static inline bb_vec_info
vec_info_for_bb (basic_block bb)
@@ -1012,6 +1064,17 @@ public:
stmt_vec_info next_element;
/* The size of the group. */
unsigned int size;
+
+ /* The size of the group before transposing. */
+ unsigned int size_before_transpose;
+
+ /* If true, the stmt_info is slp transposed. */
+ bool slp_transpose;
+
+ /* Mark the group store number for rebuilding the interleaving chain
+ during the transpose phase. Value -1 means the group cannot be
+ transposed. */
+ int group_number;
+
/* For stores, number of stores from this group seen. We vectorize the last
one. */
unsigned int store_count;
@@ -1019,6 +1082,9 @@ public:
is 1. */
unsigned int gap;
+ /* The gap before transposing. */
+ unsigned int gap_before_transpose;
+
/* The minimum negative dependence distance this stmt participates in
or zero if none. */
unsigned int min_neg_dist;
@@ -1217,6 +1283,12 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
#define STMT_VINFO_REDUC_VECTYPE_IN(S) (S)->reduc_vectype_in
#define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p
+#define DR_GROUP_SLP_TRANSPOSE(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->slp_transpose)
+#define DR_GROUP_SIZE_TRANS(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->size_before_transpose)
+#define DR_GROUP_NUMBER(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->group_number)
#define DR_GROUP_FIRST_ELEMENT(S) \
(gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element)
#define DR_GROUP_NEXT_ELEMENT(S) \
@@ -1227,6 +1299,8 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
(gcc_checking_assert ((S)->dr_aux.dr), (S)->store_count)
#define DR_GROUP_GAP(S) \
(gcc_checking_assert ((S)->dr_aux.dr), (S)->gap)
+#define DR_GROUP_GAP_TRANS(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap_before_transpose)
#define REDUC_GROUP_FIRST_ELEMENT(S) \
(gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
@@ -1624,6 +1698,17 @@ vect_get_scalar_dr_size (dr_vec_info *dr_info)
return tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_info->dr))));
}
+/* Compare two unsigned ints A and B,
+ sorting them in ascending order. */
+
+static inline int
+cmp_for_group_num (const void *a_, const void *b_)
+{
+ unsigned int a = *(unsigned int *)const_cast<void *>(a_);
+ unsigned int b = *(unsigned int *)const_cast<void *>(b_);
+ return a < b ? -1 : 1;
+}
+
/* Return true if LOOP_VINFO requires a runtime check for whether the
vector loop is profitable. */
@@ -1787,6 +1872,9 @@ extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
extern void vect_permute_store_chain (vec<tree> ,unsigned int, stmt_vec_info,
gimple_stmt_iterator *, vec<tree> *);
+extern void vect_transpose_store_chain (vec<tree>, unsigned int, unsigned int,
+ stmt_vec_info, gimple_stmt_iterator *,
+ vec<tree> *);
extern tree vect_setup_realignment (stmt_vec_info, gimple_stmt_iterator *,
tree *, enum dr_alignment_support, tree,
class loop **);
@@ -1849,6 +1937,7 @@ extern void vect_free_slp_instance (slp_instance, bool);
extern bool vect_transform_slp_perm_load (slp_tree, vec<tree> ,
gimple_stmt_iterator *, poly_uint64,
slp_instance, bool, unsigned *);
+extern void vect_transform_back_slp_grouped_stores (bb_vec_info, stmt_vec_info);
extern bool vect_slp_analyze_operations (vec_info *);
extern void vect_schedule_slp (vec_info *);
extern opt_result vect_analyze_slp (vec_info *, unsigned);
--
2.27.0.windows.1