• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
7
; Broadcast: every result lane takes element 0 of %a (all-zero mask); %b is never selected.
8define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
9; AVX1-LABEL: shuffle_v8f32_00000000:
10; AVX1:       # %bb.0:
11; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
12; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13; AVX1-NEXT:    retq
14;
15; AVX2OR512VL-LABEL: shuffle_v8f32_00000000:
16; AVX2OR512VL:       # %bb.0:
17; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %ymm0
18; AVX2OR512VL-NEXT:    retq
19  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
20  ret <8 x float> %shuffle
21}
22
; Element 0 of %a in every result lane except lane 6, which takes element 1.
23define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
24; AVX1-LABEL: shuffle_v8f32_00000010:
25; AVX1:       # %bb.0:
26; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
27; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
28; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
29; AVX1-NEXT:    retq
30;
31; AVX2OR512VL-LABEL: shuffle_v8f32_00000010:
32; AVX2OR512VL:       # %bb.0:
33; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
34; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
35; AVX2OR512VL-NEXT:    retq
36  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
37  ret <8 x float> %shuffle
38}
39
; Element 0 of %a in every result lane except lane 5, which takes element 2.
40define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
41; AVX1-LABEL: shuffle_v8f32_00000200:
42; AVX1:       # %bb.0:
43; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
44; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
45; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
46; AVX1-NEXT:    retq
47;
48; AVX2OR512VL-LABEL: shuffle_v8f32_00000200:
49; AVX2OR512VL:       # %bb.0:
50; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
51; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
52; AVX2OR512VL-NEXT:    retq
53  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
54  ret <8 x float> %shuffle
55}
56
; Element 0 of %a in every result lane except lane 4, which takes element 3.
57define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
58; AVX1-LABEL: shuffle_v8f32_00003000:
59; AVX1:       # %bb.0:
60; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
61; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
62; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
63; AVX1-NEXT:    retq
64;
65; AVX2OR512VL-LABEL: shuffle_v8f32_00003000:
66; AVX2OR512VL:       # %bb.0:
67; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
68; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
69; AVX2OR512VL-NEXT:    retq
70  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
71  ret <8 x float> %shuffle
72}
73
; Element 0 of %a in every result lane except lane 3, which takes element 4
; (an element from the upper four of %a placed into the lower four lanes).
74define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
75; AVX1-LABEL: shuffle_v8f32_00040000:
76; AVX1:       # %bb.0:
77; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
78; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
79; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
80; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
81; AVX1-NEXT:    retq
82;
83; AVX2OR512VL-LABEL: shuffle_v8f32_00040000:
84; AVX2OR512VL:       # %bb.0:
85; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
86; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
87; AVX2OR512VL-NEXT:    retq
88  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
89  ret <8 x float> %shuffle
90}
91
; Element 0 of %a in every result lane except lane 2, which takes element 5
; (an element from the upper four of %a placed into the lower four lanes).
92define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
93; AVX1-LABEL: shuffle_v8f32_00500000:
94; AVX1:       # %bb.0:
95; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
96; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
97; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
98; AVX1-NEXT:    retq
99;
100; AVX2OR512VL-LABEL: shuffle_v8f32_00500000:
101; AVX2OR512VL:       # %bb.0:
102; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
103; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
104; AVX2OR512VL-NEXT:    retq
105  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
106  ret <8 x float> %shuffle
107}
108
; Element 0 of %a in every result lane except lane 1, which takes element 6
; (an element from the upper four of %a placed into the lower four lanes).
109define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
110; AVX1-LABEL: shuffle_v8f32_06000000:
111; AVX1:       # %bb.0:
112; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
113; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
114; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4]
115; AVX1-NEXT:    retq
116;
117; AVX2OR512VL-LABEL: shuffle_v8f32_06000000:
118; AVX2OR512VL:       # %bb.0:
119; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
120; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
121; AVX2OR512VL-NEXT:    retq
122  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
123  ret <8 x float> %shuffle
124}
125
; Element 0 of %a in every result lane except lane 0, which takes element 7
; (an element from the upper four of %a placed into the lower four lanes).
126define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
127; AVX1-LABEL: shuffle_v8f32_70000000:
128; AVX1:       # %bb.0:
129; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
130; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
131; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
132; AVX1-NEXT:    retq
133;
134; AVX2OR512VL-LABEL: shuffle_v8f32_70000000:
135; AVX2OR512VL:       # %bb.0:
136; AVX2OR512VL-NEXT:    movl $7, %eax
137; AVX2OR512VL-NEXT:    vmovd %eax, %xmm1
138; AVX2OR512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
139; AVX2OR512VL-NEXT:    retq
140  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
141  ret <8 x float> %shuffle
142}
143
; Repeats the element pair (0,1) of %a across lanes 0-3 and the pair (4,5) across lanes 4-7.
144define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
145; ALL-LABEL: shuffle_v8f32_01014545:
146; ALL:       # %bb.0:
147; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
148; ALL-NEXT:    retq
149  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
150  ret <8 x float> %shuffle
151}
152
; Duplicates each of elements 0..3 of %a into two adjacent result lanes (a0,a0,a1,a1,a2,a2,a3,a3).
153define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
154; AVX1-LABEL: shuffle_v8f32_00112233:
155; AVX1:       # %bb.0:
156; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
157; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
158; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
159; AVX1-NEXT:    retq
160;
161; AVX2OR512VL-LABEL: shuffle_v8f32_00112233:
162; AVX2OR512VL:       # %bb.0:
163; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
164; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
165; AVX2OR512VL-NEXT:    retq
166  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
167  ret <8 x float> %shuffle
168}
169
; Lanes 0-3 all take element 0 of %a; lanes 4-7 all take element 1 of %a.
170define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
171; AVX1-LABEL: shuffle_v8f32_00001111:
172; AVX1:       # %bb.0:
173; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
174; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
175; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
176; AVX1-NEXT:    retq
177;
178; AVX2OR512VL-LABEL: shuffle_v8f32_00001111:
179; AVX2OR512VL:       # %bb.0:
180; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
181; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
182; AVX2OR512VL-NEXT:    retq
183  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
184  ret <8 x float> %shuffle
185}
186
; Two-input blend with no element movement: even lanes come from %b (mask 8,10,12,14),
; odd lanes from %a (mask 1,3,5,7).
187define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
188; ALL-LABEL: shuffle_v8f32_81a3c5e7:
189; ALL:       # %bb.0:
190; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
191; ALL-NEXT:    retq
192  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
193  ret <8 x float> %shuffle
194}
195
; Alternates element 0 of %a with element 0 of %b (mask index 8) across all eight lanes.
196define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
197; AVX1-LABEL: shuffle_v8f32_08080808:
198; AVX1:       # %bb.0:
199; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
200; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
201; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
202; AVX1-NEXT:    retq
203;
204; AVX2OR512VL-LABEL: shuffle_v8f32_08080808:
205; AVX2OR512VL:       # %bb.0:
206; AVX2OR512VL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
207; AVX2OR512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
208; AVX2OR512VL-NEXT:    retq
209  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
210  ret <8 x float> %shuffle
211}
212
; Lanes 0-3 alternate a[0],b[0]; lanes 4-7 alternate a[4],b[4] (mask indices 8 and 12 are b[0] and b[4]).
213define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
214; ALL-LABEL: shuffle_v8f32_08084c4c:
215; ALL:       # %bb.0:
216; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
217; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
218; ALL-NEXT:    retq
219  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
220  ret <8 x float> %shuffle
221}
222
; Result is b0,b0,a2,a3,b4,b4,a6,a7: lanes 0-1 and 4-5 duplicate b[0] and b[4],
; lanes 2-3 and 6-7 keep %a's elements in place.
223define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
224; ALL-LABEL: shuffle_v8f32_8823cc67:
225; ALL:       # %bb.0:
226; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
227; ALL-NEXT:    retq
228  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
229  ret <8 x float> %shuffle
230}
231
; Result is b1,b0,a3,a2,b5,b4,a7,a6: in each group of four lanes, %b's low pair swapped
; followed by %a's high pair swapped.
232define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
233; ALL-LABEL: shuffle_v8f32_9832dc76:
234; ALL:       # %bb.0:
235; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
236; ALL-NEXT:    retq
237  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
238  ret <8 x float> %shuffle
239}
240
; Result is b1,b0,a1,a0,b5,b4,a5,a4: in each group of four lanes, %b's low pair swapped
; followed by %a's low pair swapped.
241define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
242; ALL-LABEL: shuffle_v8f32_9810dc54:
243; ALL:       # %bb.0:
244; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
245; ALL-NEXT:    retq
246  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
247  ret <8 x float> %shuffle
248}
249
; Interleaves %a and %b using the low two elements of each group of four:
; a0,b0,a1,b1,a4,b4,a5,b5.
250define <8 x float> @shuffle_v8f32_08194c5d(<8 x float> %a, <8 x float> %b) {
251; ALL-LABEL: shuffle_v8f32_08194c5d:
252; ALL:       # %bb.0:
253; ALL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
254; ALL-NEXT:    retq
255  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
256  ret <8 x float> %shuffle
257}
258
; Interleaves %a and %b using the high two elements of each group of four:
; a2,b2,a3,b3,a6,b6,a7,b7.
259define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
260; ALL-LABEL: shuffle_v8f32_2a3b6e7f:
261; ALL:       # %bb.0:
262; ALL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
263; ALL-NEXT:    retq
264  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
265  ret <8 x float> %shuffle
266}
267
; Interleaves elements 0..3 of %a with elements 0..3 of %b across the full vector:
; a0,b0,a1,b1,a2,b2,a3,b3 (only the lower four elements of each input are read).
268define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
269; AVX1OR2-LABEL: shuffle_v8f32_08192a3b:
270; AVX1OR2:       # %bb.0:
271; AVX1OR2-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
272; AVX1OR2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
273; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
274; AVX1OR2-NEXT:    retq
275;
276; AVX512VL-LABEL: shuffle_v8f32_08192a3b:
277; AVX512VL:       # %bb.0:
278; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11]
279; AVX512VL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
280; AVX512VL-NEXT:    retq
281  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
282  ret <8 x float> %shuffle
283}
284
; Result is a0,b0,b1,b1,a1,b2,b3,b3 (mask indices 8-11 are elements 0-3 of %b);
; only the lower four elements of each input are read.
285define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
286; AVX1-LABEL: shuffle_v8f32_08991abb:
287; AVX1:       # %bb.0:
288; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
289; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
290; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
291; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
292; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
293; AVX1-NEXT:    retq
294;
295; AVX2-LABEL: shuffle_v8f32_08991abb:
296; AVX2:       # %bb.0:
297; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
298; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
299; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
300; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
301; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
302; AVX2-NEXT:    retq
303;
304; AVX512VL-LABEL: shuffle_v8f32_08991abb:
305; AVX512VL:       # %bb.0:
306; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
307; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3]
308; AVX512VL-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0
309; AVX512VL-NEXT:    retq
310  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
311  ret <8 x float> %shuffle
312}
313
; Even lanes take a0,a1,a2,a3 in order; odd lanes keep %b's element at the same
; position (b1,b3,b5,b7).
314define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
315; AVX1-LABEL: shuffle_v8f32_091b2d3f:
316; AVX1:       # %bb.0:
317; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
318; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
319; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
320; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
321; AVX1-NEXT:    retq
322;
323; AVX2-LABEL: shuffle_v8f32_091b2d3f:
324; AVX2:       # %bb.0:
325; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
326; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
327; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
328; AVX2-NEXT:    retq
329;
330; AVX512VL-LABEL: shuffle_v8f32_091b2d3f:
331; AVX512VL:       # %bb.0:
332; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
333; AVX512VL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
334; AVX512VL-NEXT:    retq
335  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
336  ret <8 x float> %shuffle
337}
338
; Lanes 0 and 4 take a0 and a1; every other lane keeps %b's element at the same
; position (b1,b2,b3 and b5,b6,b7).
339define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
340; AVX1-LABEL: shuffle_v8f32_09ab1def:
341; AVX1:       # %bb.0:
342; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
343; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
344; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
345; AVX1-NEXT:    retq
346;
347; AVX2-LABEL: shuffle_v8f32_09ab1def:
348; AVX2:       # %bb.0:
349; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
350; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
351; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
352; AVX2-NEXT:    retq
353;
354; AVX512VL-SLOW-LABEL: shuffle_v8f32_09ab1def:
355; AVX512VL-SLOW:       # %bb.0:
356; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
357; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
358; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
359; AVX512VL-SLOW-NEXT:    retq
360;
361; AVX512VL-FAST-LABEL: shuffle_v8f32_09ab1def:
362; AVX512VL-FAST:       # %bb.0:
363; AVX512VL-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
364; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7]
365; AVX512VL-FAST-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0
366; AVX512VL-FAST-NEXT:    retq
367  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
368  ret <8 x float> %shuffle
369}
370
; Single-input shuffle of %a; lanes 0-3 read only elements 0-3 and lanes 4-7 only
; elements 4-7, with the same relative pattern (0,0,0,1) in both groups.
371define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) {
372; ALL-LABEL: shuffle_v8f32_00014445:
373; ALL:       # %bb.0:
374; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
375; ALL-NEXT:    retq
376  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
377  ret <8 x float> %shuffle
378}
379
; Single-input shuffle of %a; lanes 0-3 read only elements 0-3 and lanes 4-7 only
; elements 4-7, with the same relative pattern (0,0,2,0) in both groups.
380define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) {
381; ALL-LABEL: shuffle_v8f32_00204464:
382; ALL:       # %bb.0:
383; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
384; ALL-NEXT:    retq
385  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
386  ret <8 x float> %shuffle
387}
388
; Single-input shuffle of %a; lanes 0-3 read only elements 0-3 and lanes 4-7 only
; elements 4-7, with the same relative pattern (0,3,0,0) in both groups.
389define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) {
390; ALL-LABEL: shuffle_v8f32_03004744:
391; ALL:       # %bb.0:
392; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
393; ALL-NEXT:    retq
394  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
395  ret <8 x float> %shuffle
396}
397
; Single-input shuffle of %a; lanes 0-3 read only elements 0-3 and lanes 4-7 only
; elements 4-7, with the same relative pattern (1,0,0,0) in both groups.
398define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) {
399; ALL-LABEL: shuffle_v8f32_10005444:
400; ALL:       # %bb.0:
401; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
402; ALL-NEXT:    retq
403  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
404  ret <8 x float> %shuffle
405}
406
; Single-input shuffle of %a; lanes 0-3 read only elements 0-3 and lanes 4-7 only
; elements 4-7, with the same relative pattern (2,2,0,0) in both groups.
407define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) {
408; ALL-LABEL: shuffle_v8f32_22006644:
409; ALL:       # %bb.0:
410; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
411; ALL-NEXT:    retq
412  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
413  ret <8 x float> %shuffle
414}
415
; Single-input shuffle of %a; lanes 0-3 read only elements 0-3 and lanes 4-7 only
; elements 4-7, with the same relative pattern (3,3,3,0) in both groups.
416define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) {
417; ALL-LABEL: shuffle_v8f32_33307774:
418; ALL:       # %bb.0:
419; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
420; ALL-NEXT:    retq
421  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
422  ret <8 x float> %shuffle
423}
424
; Reverses the element order of %a separately within lanes 0-3 and within lanes 4-7.
425define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) {
426; ALL-LABEL: shuffle_v8f32_32107654:
427; ALL:       # %bb.0:
428; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
429; ALL-NEXT:    retq
430  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
431  ret <8 x float> %shuffle
432}
433
; Single-input shuffle of %a; lanes 0-3 read only elements 0-3 and lanes 4-7 only
; elements 4-7, with the same relative pattern (0,0,2,3) in both groups.
434define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) {
435; ALL-LABEL: shuffle_v8f32_00234467:
436; ALL:       # %bb.0:
437; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
438; ALL-NEXT:    retq
439  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
440  ret <8 x float> %shuffle
441}
442
; Duplicates each even-indexed element of %a into the following odd lane (a0,a0,a2,a2,a4,a4,a6,a6).
443define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
444; ALL-LABEL: shuffle_v8f32_00224466:
445; ALL:       # %bb.0:
446; ALL-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
447; ALL-NEXT:    retq
448  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
449  ret <8 x float> %shuffle
450}
451
; Swaps each adjacent pair of elements of %a (a1,a0,a3,a2,a5,a4,a7,a6).
452define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) {
453; ALL-LABEL: shuffle_v8f32_10325476:
454; ALL:       # %bb.0:
455; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
456; ALL-NEXT:    retq
457  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
458  ret <8 x float> %shuffle
459}
460
; Duplicates each odd-indexed element of %a into the preceding even lane (a1,a1,a3,a3,a5,a5,a7,a7).
461define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
462; ALL-LABEL: shuffle_v8f32_11335577:
463; ALL:       # %bb.0:
464; ALL-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
465; ALL-NEXT:    retq
466  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
467  ret <8 x float> %shuffle
468}
469
; Swaps the first two elements of each group of four in %a and leaves the other
; two in place (a1,a0,a2,a3,a5,a4,a6,a7).
470define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) {
471; ALL-LABEL: shuffle_v8f32_10235467:
472; ALL:       # %bb.0:
473; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
474; ALL-NEXT:    retq
475  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
476  ret <8 x float> %shuffle
477}
478
; Single-input shuffle of %a; lanes 0-3 read only elements 0-3 and lanes 4-7 only
; elements 4-7, with the same relative pattern (1,0,2,2) in both groups.
479define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) {
480; ALL-LABEL: shuffle_v8f32_10225466:
481; ALL:       # %bb.0:
482; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
483; ALL-NEXT:    retq
484  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
485  ret <8 x float> %shuffle
486}
487
; Single-input shuffle of %a that stays within each group of four elements but uses a
; different pattern per group: a0,a0,a0,a1 in lanes 0-3 and a5,a4,a4,a4 in lanes 4-7.
488define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
489; ALL-LABEL: shuffle_v8f32_00015444:
490; ALL:       # %bb.0:
491; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
492; ALL-NEXT:    retq
493  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
494  ret <8 x float> %shuffle
495}
496
; Single-input shuffle of %a that stays within each group of four elements but uses a
; different pattern per group: a0,a0,a2,a0 in lanes 0-3 and a4,a6,a4,a4 in lanes 4-7.
497define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
498; ALL-LABEL: shuffle_v8f32_00204644:
499; ALL:       # %bb.0:
500; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
501; ALL-NEXT:    retq
502  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
503  ret <8 x float> %shuffle
504}
505
; Single-input shuffle of %a that stays within each group of four elements but uses a
; different pattern per group: a0,a3,a0,a0 in lanes 0-3 and a4,a4,a7,a4 in lanes 4-7.
506define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
507; ALL-LABEL: shuffle_v8f32_03004474:
508; ALL:       # %bb.0:
509; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
510; ALL-NEXT:    retq
511  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
512  ret <8 x float> %shuffle
513}
514
; Single-input shuffle of %a that stays within each group of four elements but uses a
; different pattern per group: a1,a0,a0,a0 in lanes 0-3 and a4 repeated in lanes 4-7.
515define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
516; ALL-LABEL: shuffle_v8f32_10004444:
517; ALL:       # %bb.0:
518; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
519; ALL-NEXT:    retq
520  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
521  ret <8 x float> %shuffle
522}
523
; Single-input shuffle of %a that stays within each group of four elements but uses a
; different pattern per group: a2,a2,a0,a0 in lanes 0-3 and a6,a4,a4,a6 in lanes 4-7.
524define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
525; ALL-LABEL: shuffle_v8f32_22006446:
526; ALL:       # %bb.0:
527; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
528; ALL-NEXT:    retq
529  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
530  ret <8 x float> %shuffle
531}
532
; Single-input shuffle of %a that stays within each group of four elements but uses a
; different pattern per group: a3,a3,a3,a0 in lanes 0-3 and a7,a4,a7,a4 in lanes 4-7.
533define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
534; ALL-LABEL: shuffle_v8f32_33307474:
535; ALL:       # %bb.0:
536; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
537; ALL-NEXT:    retq
538  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
539  ret <8 x float> %shuffle
540}
541
; Reverses elements 0-3 of %a in lanes 0-3 and passes elements 4-7 through unchanged.
542define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
543; ALL-LABEL: shuffle_v8f32_32104567:
544; ALL:       # %bb.0:
545; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
546; ALL-NEXT:    retq
547  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
548  ret <8 x float> %shuffle
549}
550
; Single-input shuffle of %a that stays within each group of four elements but uses a
; different pattern per group: a0,a0,a2,a3 in lanes 0-3 and a6,a7,a4,a4 in lanes 4-7.
551define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
552; ALL-LABEL: shuffle_v8f32_00236744:
553; ALL:       # %bb.0:
554; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
555; ALL-NEXT:    retq
556  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
557  ret <8 x float> %shuffle
558}
559
; Single-input shuffle of %a that stays within each group of four elements but uses a
; different pattern per group: a0,a0,a2,a2 in lanes 0-3 and a6,a6,a4,a4 in lanes 4-7.
560define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
561; ALL-LABEL: shuffle_v8f32_00226644:
562; ALL:       # %bb.0:
563; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
564; ALL-NEXT:    retq
565  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
566  ret <8 x float> %shuffle
567}
568
; Swaps adjacent pairs within elements 0-3 of %a (a1,a0,a3,a2) and passes elements 4-7
; through unchanged.
569define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
570; ALL-LABEL: shuffle_v8f32_10324567:
571; ALL:       # %bb.0:
572; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
573; ALL-NEXT:    retq
574  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
575  ret <8 x float> %shuffle
576}
577
; Duplicates the odd elements within elements 0-3 of %a (a1,a1,a3,a3) and passes
; elements 4-7 through unchanged.
578define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
579; ALL-LABEL: shuffle_v8f32_11334567:
580; ALL:       # %bb.0:
581; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
582; ALL-NEXT:    retq
583  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
584  ret <8 x float> %shuffle
585}
586
; Passes elements 0-3 of %a through unchanged and swaps elements 4 and 5 in the
; upper group (a5,a4,a6,a7).
587define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
588; ALL-LABEL: shuffle_v8f32_01235467:
589; ALL:       # %bb.0:
590; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
591; ALL-NEXT:    retq
592  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
593  ret <8 x float> %shuffle
594}
595
; Passes elements 0-3 of %a through unchanged; lanes 4-7 become a5,a4,a6,a6.
596define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
597; ALL-LABEL: shuffle_v8f32_01235466:
598; ALL:       # %bb.0:
599; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
600; ALL-NEXT:    retq
601  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
602  ret <8 x float> %shuffle
603}
604
; Single-input shuffle of %a with undef lanes 3 and 5 ('u' in the name); the defined
; lanes are a0,a0,a2 in the lower group and a6,a4,a4 in the upper group.
605define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
606; ALL-LABEL: shuffle_v8f32_002u6u44:
607; ALL:       # %bb.0:
608; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
609; ALL-NEXT:    retq
610  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
611  ret <8 x float> %shuffle
612}
613
; Single-input shuffle of %a with undef lanes 2, 3, 6 and 7; lanes 0-1 take a0 and
; lanes 4-5 take a6.
614define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
615; ALL-LABEL: shuffle_v8f32_00uu66uu:
616; ALL:       # %bb.0:
617; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
618; ALL-NEXT:    retq
619  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
620  ret <8 x float> %shuffle
621}
622
; Single-input shuffle of %a with undef lanes 6 and 7; lanes 0-3 are a1,a0,a3,a2 and
; lanes 4-5 keep a4,a5 in place.
623define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
624; ALL-LABEL: shuffle_v8f32_103245uu:
625; ALL:       # %bb.0:
626; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
627; ALL-NEXT:    retq
628  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
629  ret <8 x float> %shuffle
630}
631
; Single-input shuffle of %a with undef lanes 4 and 5; lanes 0-3 are a1,a1,a3,a3 and
; lanes 6-7 keep a6,a7 in place.
632define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
633; ALL-LABEL: shuffle_v8f32_1133uu67:
634; ALL:       # %bb.0:
635; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
636; ALL-NEXT:    retq
637  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
638  ret <8 x float> %shuffle
639}
640
; Single-input shuffle of %a with undef lanes 1, 2, 6 and 7; lanes 0 and 3 keep a0 and
; a3 in place, lanes 4-5 are a5,a4.
641define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
642; ALL-LABEL: shuffle_v8f32_0uu354uu:
643; ALL:       # %bb.0:
644; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
645; ALL-NEXT:    retq
646  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
647  ret <8 x float> %shuffle
648}
649
; Single-input shuffle of %a with undef lanes 0, 1, 2, 4 and 5; lane 3 keeps a3 in
; place and lanes 6-7 both take a6.
650define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
651; ALL-LABEL: shuffle_v8f32_uuu3uu66:
652; ALL:       # %bb.0:
653; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
654; ALL-NEXT:    retq
655  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
656  ret <8 x float> %shuffle
657}
658
; Irregular two-input shuffle: result is b4,a3,a4,b0,b4,b5,b2,a0 (mask indices 8-15 are
; elements 0-7 of %b). Elements from both inputs move between the lower and upper
; groups of four lanes.
659define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
660; AVX1-LABEL: shuffle_v8f32_c348cda0:
661; AVX1:       # %bb.0:
662; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
663; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
664; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
665; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
666; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
667; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
668; AVX1-NEXT:    retq
669;
670; AVX2-SLOW-LABEL: shuffle_v8f32_c348cda0:
671; AVX2-SLOW:       # %bb.0:
672; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
673; AVX2-SLOW-NEXT:    vpermps %ymm1, %ymm2, %ymm1
674; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
675; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
676; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
677; AVX2-SLOW-NEXT:    retq
678;
679; AVX2-FAST-LABEL: shuffle_v8f32_c348cda0:
680; AVX2-FAST:       # %bb.0:
681; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,3,4,7,4,7,2,0]
682; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
683; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
684; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
685; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
686; AVX2-FAST-NEXT:    retq
687;
688; AVX512VL-LABEL: shuffle_v8f32_c348cda0:
689; AVX512VL:       # %bb.0:
690; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8]
691; AVX512VL-NEXT:    vpermi2ps %ymm0, %ymm1, %ymm2
692; AVX512VL-NEXT:    vmovaps %ymm2, %ymm0
693; AVX512VL-NEXT:    retq
694  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
695  ret <8 x float> %shuffle
696}
697
; Two-input shuffle with an irregular mask. In terms of the inputs the result is
; b[7], a[5], a[1], a[1], a[2], a[3], a[5], b[2]. Expected code differs per run
; line; the +avx512vl runs expect one vpermt2ps fed by a constant index vector.
698define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
699; AVX1-LABEL: shuffle_v8f32_f511235a:
700; AVX1:       # %bb.0:
701; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
702; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
703; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
704; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,1,4,5,5,5]
705; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[3],ymm0[3]
706; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
707; AVX1-NEXT:    retq
708;
709; AVX2-SLOW-LABEL: shuffle_v8f32_f511235a:
710; AVX2-SLOW:       # %bb.0:
711; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
712; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
713; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
714; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2]
715; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
716; AVX2-SLOW-NEXT:    retq
717;
718; AVX2-FAST-LABEL: shuffle_v8f32_f511235a:
719; AVX2-FAST:       # %bb.0:
720; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [7,6,2,3,7,6,3,2]
721; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
722; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [5,5,1,1,2,3,5,5]
723; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0
724; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
725; AVX2-FAST-NEXT:    retq
726;
727; AVX512VL-LABEL: shuffle_v8f32_f511235a:
728; AVX512VL:       # %bb.0:
729; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [15,5,1,1,2,3,5,10]
730; AVX512VL-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
731; AVX512VL-NEXT:    retq
732  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
733  ret <8 x float> %shuffle
734}
735
; Reverses the low four elements of %a and repeats them in both halves of the
; result (a[3], a[2], a[1], a[0] twice). The mask never references %b.
736define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
737; AVX1-LABEL: shuffle_v8f32_32103210:
738; AVX1:       # %bb.0:
739; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
740; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
741; AVX1-NEXT:    retq
742;
743; AVX2OR512VL-LABEL: shuffle_v8f32_32103210:
744; AVX2OR512VL:       # %bb.0:
745; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
746; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
747; AVX2OR512VL-NEXT:    retq
748  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
749  ret <8 x float> %shuffle
750}
751
; Reverses the high four elements of %a and repeats them in both halves of the
; result (a[7], a[6], a[5], a[4] twice). Runs with +fast-variable-shuffle expect
; a single vpermps with a constant index vector; the remaining runs expect
; vpermilps followed by vperm2f128 or vpermpd.
752define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
753; AVX1-LABEL: shuffle_v8f32_76547654:
754; AVX1:       # %bb.0:
755; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
756; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
757; AVX1-NEXT:    retq
758;
759; AVX2-SLOW-LABEL: shuffle_v8f32_76547654:
760; AVX2-SLOW:       # %bb.0:
761; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
762; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
763; AVX2-SLOW-NEXT:    retq
764;
765; AVX2-FAST-LABEL: shuffle_v8f32_76547654:
766; AVX2-FAST:       # %bb.0:
767; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
768; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
769; AVX2-FAST-NEXT:    retq
770;
771; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654:
772; AVX512VL-SLOW:       # %bb.0:
773; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
774; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
775; AVX512VL-SLOW-NEXT:    retq
776;
777; AVX512VL-FAST-LABEL: shuffle_v8f32_76547654:
778; AVX512VL-FAST:       # %bb.0:
779; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
780; AVX512VL-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
781; AVX512VL-FAST-NEXT:    retq
782  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
783  ret <8 x float> %shuffle
784}
785
; Reverses all eight elements of %a; the mask never references %b. Runs with
; +fast-variable-shuffle expect a single vpermps with a constant index vector;
; the remaining runs expect vpermilps followed by vperm2f128 or vpermpd.
786define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
787; AVX1-LABEL: shuffle_v8f32_76543210:
788; AVX1:       # %bb.0:
789; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
790; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
791; AVX1-NEXT:    retq
792;
793; AVX2-SLOW-LABEL: shuffle_v8f32_76543210:
794; AVX2-SLOW:       # %bb.0:
795; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
796; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
797; AVX2-SLOW-NEXT:    retq
798;
799; AVX2-FAST-LABEL: shuffle_v8f32_76543210:
800; AVX2-FAST:       # %bb.0:
801; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
802; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
803; AVX2-FAST-NEXT:    retq
804;
805; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210:
806; AVX512VL-SLOW:       # %bb.0:
807; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
808; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
809; AVX512VL-SLOW-NEXT:    retq
810;
811; AVX512VL-FAST-LABEL: shuffle_v8f32_76543210:
812; AVX512VL-FAST:       # %bb.0:
813; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
814; AVX512VL-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
815; AVX512VL-FAST-NEXT:    retq
816  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
817  ret <8 x float> %shuffle
818}
819
; Low four elements of %a reversed, followed by the low four elements of %b
; reversed. Every run line expects vinsertf128 followed by vpermilps.
820define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
821; ALL-LABEL: shuffle_v8f32_3210ba98:
822; ALL:       # %bb.0:
823; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
824; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
825; ALL-NEXT:    retq
826  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
827  ret <8 x float> %shuffle
828}
829
; Low four elements of %a reversed, followed by the high four elements of %b
; reversed. Every run line expects vblendps followed by vpermilps.
830define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
831; ALL-LABEL: shuffle_v8f32_3210fedc:
832; ALL:       # %bb.0:
833; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
834; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
835; ALL-NEXT:    retq
836  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
837  ret <8 x float> %shuffle
838}
839
; High four elements of %a reversed, followed by the high four elements of %b
; reversed. Most runs expect vperm2f128 followed by vpermilps; the +avx512vl run
; with +fast-variable-shuffle expects a single vpermt2ps with constant indices.
840define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) {
841; AVX1OR2-LABEL: shuffle_v8f32_7654fedc:
842; AVX1OR2:       # %bb.0:
843; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
844; AVX1OR2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
845; AVX1OR2-NEXT:    retq
846;
847; AVX512VL-SLOW-LABEL: shuffle_v8f32_7654fedc:
848; AVX512VL-SLOW:       # %bb.0:
849; AVX512VL-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
850; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
851; AVX512VL-SLOW-NEXT:    retq
852;
853; AVX512VL-FAST-LABEL: shuffle_v8f32_7654fedc:
854; AVX512VL-FAST:       # %bb.0:
855; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12]
856; AVX512VL-FAST-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
857; AVX512VL-FAST-NEXT:    retq
858  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
859  ret <8 x float> %shuffle
860}
861
; High four elements of %b reversed, followed by the high four elements of %a
; reversed. Most runs expect vperm2f128 followed by vpermilps; the +avx512vl run
; with +fast-variable-shuffle expects vpermi2ps with constant indices.
862define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
863; AVX1OR2-LABEL: shuffle_v8f32_fedc7654:
864; AVX1OR2:       # %bb.0:
865; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
866; AVX1OR2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
867; AVX1OR2-NEXT:    retq
868;
869; AVX512VL-SLOW-LABEL: shuffle_v8f32_fedc7654:
870; AVX512VL-SLOW:       # %bb.0:
871; AVX512VL-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
872; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
873; AVX512VL-SLOW-NEXT:    retq
874;
875; AVX512VL-FAST-LABEL: shuffle_v8f32_fedc7654:
876; AVX512VL-FAST:       # %bb.0:
877; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12]
878; AVX512VL-FAST-NEXT:    vpermi2ps %ymm0, %ymm1, %ymm2
879; AVX512VL-FAST-NEXT:    vmovaps %ymm2, %ymm0
880; AVX512VL-FAST-NEXT:    retq
881  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
882  ret <8 x float> %shuffle
883}
884
; Regression test named after bug report PR21138 (the report itself is not
; quoted here). The mask gathers the odd-indexed elements of %truc followed by
; the odd-indexed elements of %tchose.
885define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
886; AVX1-LABEL: PR21138:
887; AVX1:       # %bb.0:
888; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
889; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
890; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
891; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
892; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
893; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
894; AVX1-NEXT:    retq
895;
896; AVX2-LABEL: PR21138:
897; AVX2:       # %bb.0:
898; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
899; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
900; AVX2-NEXT:    retq
901;
902; AVX512VL-SLOW-LABEL: PR21138:
903; AVX512VL-SLOW:       # %bb.0:
904; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
905; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
906; AVX512VL-SLOW-NEXT:    retq
907;
908; AVX512VL-FAST-LABEL: PR21138:
909; AVX512VL-FAST:       # %bb.0:
910; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
911; AVX512VL-FAST-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
912; AVX512VL-FAST-NEXT:    retq
913  %shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
914  ret <8 x float> %shuffle
915}
916
; Low four elements of %b reversed, followed by the high four elements of %a
; reversed. Every run line expects vblendps followed by vpermilps.
917define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
918; ALL-LABEL: shuffle_v8f32_ba987654:
919; ALL:       # %bb.0:
920; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
921; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
922; ALL-NEXT:    retq
923  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
924  ret <8 x float> %shuffle
925}
926
; Low four elements of %b reversed, followed by the low four elements of %a
; reversed. Every run line expects vinsertf128 followed by vpermilps.
927define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
928; ALL-LABEL: shuffle_v8f32_ba983210:
929; ALL:       # %bb.0:
930; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
931; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
932; ALL-NEXT:    retq
933  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
934  ret <8 x float> %shuffle
935}
936
; Result is b[0], a[0], undef, a[1], b[4], a[4], undef, a[5]. Every run line
; expects a single vunpcklps with ymm1 listed as the first source.
937define <8 x float> @shuffle_v8f32_80u1c4u5(<8 x float> %a, <8 x float> %b) {
938; ALL-LABEL: shuffle_v8f32_80u1c4u5:
939; ALL:       # %bb.0:
940; ALL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
941; ALL-NEXT:    retq
942  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 0, i32 undef, i32 1, i32 12, i32 4, i32 undef, i32 5>
943  ret <8 x float> %shuffle
944}
945
; Result is b[2], a[2], undef, a[3], b[6], a[6], b[7], a[7]. Every run line
; expects a single vunpckhps with ymm1 listed as the first source.
946define <8 x float> @shuffle_v8f32_a2u3e6f7(<8 x float> %a, <8 x float> %b) {
947; ALL-LABEL: shuffle_v8f32_a2u3e6f7:
948; ALL:       # %bb.0:
949; ALL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
950; ALL-NEXT:    retq
951  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 10, i32 2, i32 undef, i32 3, i32 14, i32 6, i32 15, i32 7>
952  ret <8 x float> %shuffle
953}
954
; Result is a[0], b[0], a[4], b[4], a[1], b[1], a[5], b[5]. The +avx run expects
; a long permute/blend sequence; the +avx2 and slow +avx512vl runs expect
; vunpcklps followed by vpermpd; the fast +avx512vl run expects vpermt2ps.
955define <8 x float> @shuffle_v8f32_084c195d(<8 x float> %a, <8 x float> %b) {
956; AVX1-LABEL: shuffle_v8f32_084c195d:
957; AVX1:       # %bb.0:
958; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
959; AVX1-NEXT:    vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
960; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
961; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
962; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
963; AVX1-NEXT:    vpermilps {{.*#+}} ymm2 = ymm2[1,1,0,3,5,5,4,7]
964; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
965; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7]
966; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
967; AVX1-NEXT:    retq
968;
969; AVX2-LABEL: shuffle_v8f32_084c195d:
970; AVX2:       # %bb.0:
971; AVX2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
972; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
973; AVX2-NEXT:    retq
974;
975; AVX512VL-SLOW-LABEL: shuffle_v8f32_084c195d:
976; AVX512VL-SLOW:       # %bb.0:
977; AVX512VL-SLOW-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
978; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
979; AVX512VL-SLOW-NEXT:    retq
980;
981; AVX512VL-FAST-LABEL: shuffle_v8f32_084c195d:
982; AVX512VL-FAST:       # %bb.0:
983; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,8,4,12,1,9,5,13]
984; AVX512VL-FAST-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
985; AVX512VL-FAST-NEXT:    retq
986  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 4, i32 12, i32 1, i32 9, i32 5, i32 13>
987  ret <8 x float> %shuffle
988}
989
; Single-input shuffle (the second operand is undef) that swaps the two middle
; element pairs, giving a[0,1], a[4,5], a[2,3], a[6,7]. The +avx2 and +avx512vl
; runs expect a single vpermpd.
990define <8 x float> @shuffle_v8f32_01452367d(<8 x float> %a) {
991; AVX1-LABEL: shuffle_v8f32_01452367d:
992; AVX1:       # %bb.0:
993; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
994; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
995; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
996; AVX1-NEXT:    retq
997;
998; AVX2OR512VL-LABEL: shuffle_v8f32_01452367d:
999; AVX2OR512VL:       # %bb.0:
1000; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1001; AVX2OR512VL-NEXT:    retq
1002  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
1003  ret <8 x float> %shuffle
1004}
1005
; Low half of the result is undef; the high half is a[1] repeated four times.
; Every run line expects an xmm vpermilps followed by vinsertf128.
1006define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
1007; ALL-LABEL: shuffle_v8f32_uuuu1111:
1008; ALL:       # %bb.0:
1009; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
1010; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1011; ALL-NEXT:    retq
1012  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
1013  ret <8 x float> %shuffle
1014}
1015
; Broadcasts a[4] to all eight lanes. The +avx run expects vpermilps followed by
; vperm2f128; the other runs expect vextractf128 followed by vbroadcastss.
1016define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {
1017; AVX1-LABEL: shuffle_v8f32_44444444:
1018; AVX1:       # %bb.0:
1019; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
1020; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
1021; AVX1-NEXT:    retq
1022;
1023; AVX2OR512VL-LABEL: shuffle_v8f32_44444444:
1024; AVX2OR512VL:       # %bb.0:
1025; AVX2OR512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
1026; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %ymm0
1027; AVX2OR512VL-NEXT:    retq
1028  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
1029  ret <8 x float> %shuffle
1030}
1031
; Low half of the result is a[1], a[1], b[0], b[0]; the high half is undef.
; Every run line expects a single xmm-width vshufps.
1032define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) {
1033; ALL-LABEL: shuffle_v8f32_1188uuuu:
1034; ALL:       # %bb.0:
1035; ALL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
1036; ALL-NEXT:    retq
1037  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
1038  ret <8 x float> %shuffle
1039}
1040
; Low half of the result is undef; the high half is the low four elements of %a
; reversed. Every run line expects an xmm vpermilps followed by vinsertf128.
1041define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
1042; ALL-LABEL: shuffle_v8f32_uuuu3210:
1043; ALL:       # %bb.0:
1044; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
1045; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1046; ALL-NEXT:    retq
1047  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0>
1048  ret <8 x float> %shuffle
1049}
1050
; Low half of the result is undef; the high half is a[1], a[1], b[0], b[0].
; Every run line expects an xmm vshufps followed by vinsertf128.
1051define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) {
1052; ALL-LABEL: shuffle_v8f32_uuuu1188:
1053; ALL:       # %bb.0:
1054; ALL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
1055; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1056; ALL-NEXT:    retq
1057  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 8, i32 8>
1058  ret <8 x float> %shuffle
1059}
1060
; Low half of the result is a[1] repeated four times; the high half is undef.
; Every run line expects a single xmm-width vpermilps.
1061define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
1062; ALL-LABEL: shuffle_v8f32_1111uuuu:
1063; ALL:       # %bb.0:
1064; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
1065; ALL-NEXT:    retq
1066  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
1067  ret <8 x float> %shuffle
1068}
1069
; Low half of the result is a[5] repeated four times; the high half is undef.
; Every run line expects vextractf128 followed by an xmm vpermilps.
1070define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
1071; ALL-LABEL: shuffle_v8f32_5555uuuu:
1072; ALL:       # %bb.0:
1073; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
1074; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
1075; ALL-NEXT:    retq
1076  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
1077  ret <8 x float> %shuffle
1078}
1079
; Integer (<8 x i32>) test: broadcasts a[0] to all eight lanes. The checks
; expect ps-suffixed instructions (vpermilps, vbroadcastss) even though the
; element type is i32.
1080define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
1081; AVX1-LABEL: shuffle_v8i32_00000000:
1082; AVX1:       # %bb.0:
1083; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
1084; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1085; AVX1-NEXT:    retq
1086;
1087; AVX2OR512VL-LABEL: shuffle_v8i32_00000000:
1088; AVX2OR512VL:       # %bb.0:
1089; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %ymm0
1090; AVX2OR512VL-NEXT:    retq
1091  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
1092  ret <8 x i32> %shuffle
1093}
1094
; a[0] in every lane except lane 6, which takes a[1]. The +avx2 and +avx512vl
; runs expect an xmm vpermilps followed by vpermpd.
1095define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
1096; AVX1-LABEL: shuffle_v8i32_00000010:
1097; AVX1:       # %bb.0:
1098; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
1099; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
1100; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1101; AVX1-NEXT:    retq
1102;
1103; AVX2OR512VL-LABEL: shuffle_v8i32_00000010:
1104; AVX2OR512VL:       # %bb.0:
1105; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
1106; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
1107; AVX2OR512VL-NEXT:    retq
1108  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
1109  ret <8 x i32> %shuffle
1110}
1111
; a[0] in every lane except lane 5, which takes a[2]. The +avx2 and +avx512vl
; runs expect an xmm vpermilps followed by vpermpd.
1112define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
1113; AVX1-LABEL: shuffle_v8i32_00000200:
1114; AVX1:       # %bb.0:
1115; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
1116; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
1117; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1118; AVX1-NEXT:    retq
1119;
1120; AVX2OR512VL-LABEL: shuffle_v8i32_00000200:
1121; AVX2OR512VL:       # %bb.0:
1122; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
1123; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
1124; AVX2OR512VL-NEXT:    retq
1125  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
1126  ret <8 x i32> %shuffle
1127}
1128
; a[0] in every lane except lane 4, which takes a[3]. The +avx2 and +avx512vl
; runs expect an xmm vpermilps followed by vpermpd.
1129define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
1130; AVX1-LABEL: shuffle_v8i32_00003000:
1131; AVX1:       # %bb.0:
1132; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
1133; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
1134; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1135; AVX1-NEXT:    retq
1136;
1137; AVX2OR512VL-LABEL: shuffle_v8i32_00003000:
1138; AVX2OR512VL:       # %bb.0:
1139; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
1140; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
1141; AVX2OR512VL-NEXT:    retq
1142  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
1143  ret <8 x i32> %shuffle
1144}
1145
; a[0] in every lane except lane 3, which takes a[4] from the other 128-bit
; half. The +avx2 and +avx512vl runs expect vpermps with a constant index vector.
1146define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
1147; AVX1-LABEL: shuffle_v8i32_00040000:
1148; AVX1:       # %bb.0:
1149; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
1150; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
1151; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
1152; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
1153; AVX1-NEXT:    retq
1154;
1155; AVX2OR512VL-LABEL: shuffle_v8i32_00040000:
1156; AVX2OR512VL:       # %bb.0:
1157; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
1158; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1159; AVX2OR512VL-NEXT:    retq
1160  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
1161  ret <8 x i32> %shuffle
1162}
1163
; a[0] in every lane except lane 2, which takes a[5] from the other 128-bit
; half. The +avx2 and +avx512vl runs expect vpermps with a constant index vector.
1164define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
1165; AVX1-LABEL: shuffle_v8i32_00500000:
1166; AVX1:       # %bb.0:
1167; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
1168; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
1169; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
1170; AVX1-NEXT:    retq
1171;
1172; AVX2OR512VL-LABEL: shuffle_v8i32_00500000:
1173; AVX2OR512VL:       # %bb.0:
1174; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
1175; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1176; AVX2OR512VL-NEXT:    retq
1177  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
1178  ret <8 x i32> %shuffle
1179}
1180
; a[0] in every lane except lane 1, which takes a[6] from the other 128-bit
; half. The +avx2 and +avx512vl runs expect vpermps with a constant index vector.
1181define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
1182; AVX1-LABEL: shuffle_v8i32_06000000:
1183; AVX1:       # %bb.0:
1184; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
1185; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1186; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4]
1187; AVX1-NEXT:    retq
1188;
1189; AVX2OR512VL-LABEL: shuffle_v8i32_06000000:
1190; AVX2OR512VL:       # %bb.0:
1191; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
1192; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1193; AVX2OR512VL-NEXT:    retq
1194  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
1195  ret <8 x i32> %shuffle
1196}
1197
; Lane 0 takes a[7]; every other lane takes a[0]. The +avx2 and +avx512vl runs
; expect the index vector to be materialized with movl $7 and vmovd, followed by
; vpermd.
1198define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
1199; AVX1-LABEL: shuffle_v8i32_70000000:
1200; AVX1:       # %bb.0:
1201; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
1202; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1203; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
1204; AVX1-NEXT:    retq
1205;
1206; AVX2OR512VL-LABEL: shuffle_v8i32_70000000:
1207; AVX2OR512VL:       # %bb.0:
1208; AVX2OR512VL-NEXT:    movl $7, %eax
1209; AVX2OR512VL-NEXT:    vmovd %eax, %xmm1
1210; AVX2OR512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
1211; AVX2OR512VL-NEXT:    retq
1212  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
1213  ret <8 x i32> %shuffle
1214}
1215
; Repeats the first element pair of each 128-bit half: a[0,1,0,1], a[4,5,4,5].
; The +avx run expects vmovddup; the other runs expect a single vpermilps.
1216define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
1217; AVX1-LABEL: shuffle_v8i32_01014545:
1218; AVX1:       # %bb.0:
1219; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
1220; AVX1-NEXT:    retq
1221;
1222; AVX2OR512VL-LABEL: shuffle_v8i32_01014545:
1223; AVX2OR512VL:       # %bb.0:
1224; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
1225; AVX2OR512VL-NEXT:    retq
1226  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
1227  ret <8 x i32> %shuffle
1228}
1229
; Duplicates each of a[0..3] into two adjacent lanes. The +avx2 and +avx512vl
; runs expect vpermps with a constant index vector.
1230define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
1231; AVX1-LABEL: shuffle_v8i32_00112233:
1232; AVX1:       # %bb.0:
1233; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
1234; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1235; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1236; AVX1-NEXT:    retq
1237;
1238; AVX2OR512VL-LABEL: shuffle_v8i32_00112233:
1239; AVX2OR512VL:       # %bb.0:
1240; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
1241; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1242; AVX2OR512VL-NEXT:    retq
1243  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
1244  ret <8 x i32> %shuffle
1245}
1246
; Low half is a[0] repeated four times; high half is a[1] repeated four times.
; The +avx2 and +avx512vl runs expect an xmm vpermilps followed by vpermpd.
1247define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
1248; AVX1-LABEL: shuffle_v8i32_00001111:
1249; AVX1:       # %bb.0:
1250; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
1251; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
1252; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1253; AVX1-NEXT:    retq
1254;
1255; AVX2OR512VL-LABEL: shuffle_v8i32_00001111:
1256; AVX2OR512VL:       # %bb.0:
1257; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
1258; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
1259; AVX2OR512VL-NEXT:    retq
1260  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
1261  ret <8 x i32> %shuffle
1262}
1263
; Even lanes come from %b and odd lanes from %a, each staying in its own
; position. Every run line expects a single vblendps.
1264define <8 x i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) {
1265; ALL-LABEL: shuffle_v8i32_81a3c5e7:
1266; ALL:       # %bb.0:
1267; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
1268; ALL-NEXT:    retq
1269  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
1270  ret <8 x i32> %shuffle
1271}
1272
; Alternates a[0] and b[0] across all eight lanes. The +avx2 and +avx512vl runs
; expect an xmm vunpcklps followed by vbroadcastsd.
1273define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
1274; AVX1-LABEL: shuffle_v8i32_08080808:
1275; AVX1:       # %bb.0:
1276; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
1277; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1278; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1279; AVX1-NEXT:    retq
1280;
1281; AVX2OR512VL-LABEL: shuffle_v8i32_08080808:
1282; AVX2OR512VL:       # %bb.0:
1283; AVX2OR512VL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1284; AVX2OR512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
1285; AVX2OR512VL-NEXT:    retq
1286  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
1287  ret <8 x i32> %shuffle
1288}
1289
; Low half alternates a[0] and b[0]; high half alternates a[4] and b[4]. The
; +avx run expects vshufps followed by vpermilps; the other runs expect two
; vpermilps followed by vblendps.
1290define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
1291; AVX1-LABEL: shuffle_v8i32_08084c4c:
1292; AVX1:       # %bb.0:
1293; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
1294; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1295; AVX1-NEXT:    retq
1296;
1297; AVX2OR512VL-LABEL: shuffle_v8i32_08084c4c:
1298; AVX2OR512VL:       # %bb.0:
1299; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4]
1300; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
1301; AVX2OR512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1302; AVX2OR512VL-NEXT:    retq
1303  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
1304  ret <8 x i32> %shuffle
1305}
1306
; Result is b[0], b[0], a[2], a[3], b[4], b[4], a[6], a[7]. Every run line
; expects a single vshufps.
1307define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) {
1308; ALL-LABEL: shuffle_v8i32_8823cc67:
1309; ALL:       # %bb.0:
1310; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
1311; ALL-NEXT:    retq
1312  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
1313  ret <8 x i32> %shuffle
1314}
1315
; Result is b[1], b[0], a[3], a[2], b[5], b[4], a[7], a[6]. Every run line
; expects a single vshufps.
1316define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) {
1317; ALL-LABEL: shuffle_v8i32_9832dc76:
1318; ALL:       # %bb.0:
1319; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
1320; ALL-NEXT:    retq
1321  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
1322  ret <8 x i32> %shuffle
1323}
1324
; Result is b[1], b[0], a[1], a[0], b[5], b[4], a[5], a[4]. Every run line
; expects a single vshufps.
1325define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) {
1326; ALL-LABEL: shuffle_v8i32_9810dc54:
1327; ALL:       # %bb.0:
1328; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
1329; ALL-NEXT:    retq
1330  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
1331  ret <8 x i32> %shuffle
1332}
1333
; Result is a[0], b[0], a[1], b[1], a[4], b[4], a[5], b[5]. Every run line
; expects a single vunpcklps.
1334define <8 x i32> @shuffle_v8i32_08194c5d(<8 x i32> %a, <8 x i32> %b) {
1335; ALL-LABEL: shuffle_v8i32_08194c5d:
1336; ALL:       # %bb.0:
1337; ALL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
1338; ALL-NEXT:    retq
1339  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
1340  ret <8 x i32> %shuffle
1341}
1342
; Result is a[2], b[2], a[3], b[3], a[6], b[6], a[7], b[7]. Every run line
; expects a single vunpckhps.
1343define <8 x i32> @shuffle_v8i32_2a3b6e7f(<8 x i32> %a, <8 x i32> %b) {
1344; ALL-LABEL: shuffle_v8i32_2a3b6e7f:
1345; ALL:       # %bb.0:
1346; ALL-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
1347; ALL-NEXT:    retq
1348  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
1349  ret <8 x i32> %shuffle
1350}
1351
; Interleaves the low four elements of %a and %b across the whole result:
; a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3]. The +avx and +avx2 runs expect
; two xmm unpacks joined by vinsertf128; the +avx512vl runs expect vpmovzxdq
; followed by vpermi2d with a constant index vector.
1352define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
1353; AVX1OR2-LABEL: shuffle_v8i32_08192a3b:
1354; AVX1OR2:       # %bb.0:
1355; AVX1OR2-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1356; AVX1OR2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1357; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1358; AVX1OR2-NEXT:    retq
1359;
1360; AVX512VL-LABEL: shuffle_v8i32_08192a3b:
1361; AVX512VL:       # %bb.0:
1362; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1363; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,8,2,9,4,10,6,11]
1364; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm2, %ymm0
1365; AVX512VL-NEXT:    retq
1366  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
1367  ret <8 x i32> %shuffle
1368}
1369
; Result is a[0], b[0], b[1], b[1], a[1], b[2], b[3], b[3]. Expected code
; differs per run line; the +avx2 and +avx512vl sequences both start from a
; vpmovzxdq of %a's low elements and use integer-named instructions.
1370define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
1371; AVX1-LABEL: shuffle_v8i32_08991abb:
1372; AVX1:       # %bb.0:
1373; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
1374; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
1375; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1376; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
1377; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
1378; AVX1-NEXT:    retq
1379;
1380; AVX2-LABEL: shuffle_v8i32_08991abb:
1381; AVX2:       # %bb.0:
1382; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
1383; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
1384; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1385; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
1386; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1387; AVX2-NEXT:    retq
1388;
1389; AVX512VL-LABEL: shuffle_v8i32_08991abb:
1390; AVX512VL:       # %bb.0:
1391; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
1392; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3]
1393; AVX512VL-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0
1394; AVX512VL-NEXT:    retq
1395  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
1396  ret <8 x i32> %shuffle
1397}
1398
; Even result positions take elements 0-3 of %a in order; odd positions keep
; the same-position element of %b (mask 0,9,1,11,2,13,3,15).
1399define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
1400; AVX1-LABEL: shuffle_v8i32_091b2d3f:
1401; AVX1:       # %bb.0:
1402; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
1403; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
1404; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
1405; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1406; AVX1-NEXT:    retq
1407;
1408; AVX2OR512VL-LABEL: shuffle_v8i32_091b2d3f:
1409; AVX2OR512VL:       # %bb.0:
1410; AVX2OR512VL-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1411; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
1412; AVX2OR512VL-NEXT:    retq
1413  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
1414  ret <8 x i32> %shuffle
1415}
1416
; Result positions 0 and 4 receive elements 0 and 1 of %a; all other positions
; keep the same-position element of %b (mask 0,9,10,11 | 1,13,14,15). The
; AVX512VL expectations differ between the slow and fast variable-shuffle runs.
1417define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
1418; AVX1-LABEL: shuffle_v8i32_09ab1def:
1419; AVX1:       # %bb.0:
1420; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[1,1,3,3]
1421; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1422; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1423; AVX1-NEXT:    retq
1424;
1425; AVX2-LABEL: shuffle_v8i32_09ab1def:
1426; AVX2:       # %bb.0:
1427; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1428; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
1429; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1430; AVX2-NEXT:    retq
1431;
1432; AVX512VL-SLOW-LABEL: shuffle_v8i32_09ab1def:
1433; AVX512VL-SLOW:       # %bb.0:
1434; AVX512VL-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1435; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
1436; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1437; AVX512VL-SLOW-NEXT:    retq
1438;
1439; AVX512VL-FAST-LABEL: shuffle_v8i32_09ab1def:
1440; AVX512VL-FAST:       # %bb.0:
1441; AVX512VL-FAST-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
1442; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7]
1443; AVX512VL-FAST-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0
1444; AVX512VL-FAST-NEXT:    retq
1445  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
1446  ret <8 x i32> %shuffle
1447}
1448
; Single-input shuffle applying the same in-lane pattern [0,0,0,1] to both
; 128-bit lanes of %a; every RUN configuration expects one vpermilps.
1449define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) {
1450; ALL-LABEL: shuffle_v8i32_00014445:
1451; ALL:       # %bb.0:
1452; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
1453; ALL-NEXT:    retq
1454  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
1455  ret <8 x i32> %shuffle
1456}
1457
; Single-input shuffle applying the same in-lane pattern [0,0,2,0] to both
; 128-bit lanes of %a; every RUN configuration expects one vpermilps.
1458define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) {
1459; ALL-LABEL: shuffle_v8i32_00204464:
1460; ALL:       # %bb.0:
1461; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
1462; ALL-NEXT:    retq
1463  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
1464  ret <8 x i32> %shuffle
1465}
1466
; Single-input shuffle applying the same in-lane pattern [0,3,0,0] to both
; 128-bit lanes of %a; every RUN configuration expects one vpermilps.
1467define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) {
1468; ALL-LABEL: shuffle_v8i32_03004744:
1469; ALL:       # %bb.0:
1470; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
1471; ALL-NEXT:    retq
1472  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
1473  ret <8 x i32> %shuffle
1474}
1475
; Single-input shuffle applying the same in-lane pattern [1,0,0,0] to both
; 128-bit lanes of %a; every RUN configuration expects one vpermilps.
1476define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) {
1477; ALL-LABEL: shuffle_v8i32_10005444:
1478; ALL:       # %bb.0:
1479; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
1480; ALL-NEXT:    retq
1481  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
1482  ret <8 x i32> %shuffle
1483}
1484
; Single-input shuffle applying the same in-lane pattern [2,2,0,0] to both
; 128-bit lanes of %a; every RUN configuration expects one vpermilps.
1485define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) {
1486; ALL-LABEL: shuffle_v8i32_22006644:
1487; ALL:       # %bb.0:
1488; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
1489; ALL-NEXT:    retq
1490  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
1491  ret <8 x i32> %shuffle
1492}
1493
; Single-input shuffle applying the same in-lane pattern [3,3,3,0] to both
; 128-bit lanes of %a; every RUN configuration expects one vpermilps.
1494define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) {
1495; ALL-LABEL: shuffle_v8i32_33307774:
1496; ALL:       # %bb.0:
1497; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
1498; ALL-NEXT:    retq
1499  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
1500  ret <8 x i32> %shuffle
1501}
1502
; Reverses the four elements inside each 128-bit lane of %a without moving data
; between lanes (mask 3,2,1,0 | 7,6,5,4); every RUN configuration expects one
; vpermilps.
1503define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) {
1504; ALL-LABEL: shuffle_v8i32_32107654:
1505; ALL:       # %bb.0:
1506; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1507; ALL-NEXT:    retq
1508  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
1509  ret <8 x i32> %shuffle
1510}
1511
; Single-input shuffle applying the same in-lane pattern [0,0,2,3] to both
; 128-bit lanes of %a; every RUN configuration expects one vpermilps.
1512define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) {
1513; ALL-LABEL: shuffle_v8i32_00234467:
1514; ALL:       # %bb.0:
1515; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
1516; ALL-NEXT:    retq
1517  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
1518  ret <8 x i32> %shuffle
1519}
1520
; Duplicates each even-indexed element of %a into the following odd position
; (mask 0,0,2,2,4,4,6,6). The AVX1 run expects vmovsldup; the AVX2 and AVX512VL
; runs expect vpermilps.
1521define <8 x i32> @shuffle_v8i32_00224466(<8 x i32> %a, <8 x i32> %b) {
1522; AVX1-LABEL: shuffle_v8i32_00224466:
1523; AVX1:       # %bb.0:
1524; AVX1-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
1525; AVX1-NEXT:    retq
1526;
1527; AVX2OR512VL-LABEL: shuffle_v8i32_00224466:
1528; AVX2OR512VL:       # %bb.0:
1529; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
1530; AVX2OR512VL-NEXT:    retq
1531  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1532  ret <8 x i32> %shuffle
1533}
1534
; Swaps each adjacent pair of elements of %a (mask 1,0,3,2,5,4,7,6); every RUN
; configuration expects one vpermilps.
1535define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) {
1536; ALL-LABEL: shuffle_v8i32_10325476:
1537; ALL:       # %bb.0:
1538; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
1539; ALL-NEXT:    retq
1540  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
1541  ret <8 x i32> %shuffle
1542}
1543
; Duplicates each odd-indexed element of %a into the preceding even position
; (mask 1,1,3,3,5,5,7,7). The AVX1 run expects vmovshdup; the AVX2 and AVX512VL
; runs expect vpermilps.
1544define <8 x i32> @shuffle_v8i32_11335577(<8 x i32> %a, <8 x i32> %b) {
1545; AVX1-LABEL: shuffle_v8i32_11335577:
1546; AVX1:       # %bb.0:
1547; AVX1-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
1548; AVX1-NEXT:    retq
1549;
1550; AVX2OR512VL-LABEL: shuffle_v8i32_11335577:
1551; AVX2OR512VL:       # %bb.0:
1552; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
1553; AVX2OR512VL-NEXT:    retq
1554  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
1555  ret <8 x i32> %shuffle
1556}
1557
; Single-input shuffle applying the same in-lane pattern [1,0,2,3] to both
; 128-bit lanes of %a; every RUN configuration expects one vpermilps.
1558define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) {
1559; ALL-LABEL: shuffle_v8i32_10235467:
1560; ALL:       # %bb.0:
1561; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
1562; ALL-NEXT:    retq
1563  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
1564  ret <8 x i32> %shuffle
1565}
1566
; Single-input shuffle applying the same in-lane pattern [1,0,2,2] to both
; 128-bit lanes of %a; every RUN configuration expects one vpermilps.
1567define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) {
1568; ALL-LABEL: shuffle_v8i32_10225466:
1569; ALL:       # %bb.0:
1570; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
1571; ALL-NEXT:    retq
1572  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
1573  ret <8 x i32> %shuffle
1574}
1575
; In-lane single-input shuffle whose pattern differs between the two 128-bit
; lanes (mask 0,0,0,1 | 5,4,4,4). The AVX1 run expects one vpermilps; the AVX2
; and AVX512VL runs expect a constant-index vpermps.
1576define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
1577; AVX1-LABEL: shuffle_v8i32_00015444:
1578; AVX1:       # %bb.0:
1579; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
1580; AVX1-NEXT:    retq
1581;
1582; AVX2OR512VL-LABEL: shuffle_v8i32_00015444:
1583; AVX2OR512VL:       # %bb.0:
1584; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
1585; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1586; AVX2OR512VL-NEXT:    retq
1587  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
1588  ret <8 x i32> %shuffle
1589}
1590
; In-lane single-input shuffle whose pattern differs between the two 128-bit
; lanes (mask 0,0,2,0 | 4,6,4,4). The AVX1 run expects one vpermilps; the AVX2
; and AVX512VL runs expect a constant-index vpermps.
1591define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
1592; AVX1-LABEL: shuffle_v8i32_00204644:
1593; AVX1:       # %bb.0:
1594; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
1595; AVX1-NEXT:    retq
1596;
1597; AVX2OR512VL-LABEL: shuffle_v8i32_00204644:
1598; AVX2OR512VL:       # %bb.0:
1599; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
1600; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1601; AVX2OR512VL-NEXT:    retq
1602  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
1603  ret <8 x i32> %shuffle
1604}
1605
; In-lane single-input shuffle whose pattern differs between the two 128-bit
; lanes (mask 0,3,0,0 | 4,4,7,4). The AVX1 run expects one vpermilps; the AVX2
; and AVX512VL runs expect a constant-index vpermps.
1606define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
1607; AVX1-LABEL: shuffle_v8i32_03004474:
1608; AVX1:       # %bb.0:
1609; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
1610; AVX1-NEXT:    retq
1611;
1612; AVX2OR512VL-LABEL: shuffle_v8i32_03004474:
1613; AVX2OR512VL:       # %bb.0:
1614; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
1615; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1616; AVX2OR512VL-NEXT:    retq
1617  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
1618  ret <8 x i32> %shuffle
1619}
1620
; In-lane single-input shuffle whose pattern differs between the two 128-bit
; lanes (mask 1,0,0,0 | 4,4,4,4). The AVX1 run expects one vpermilps; the AVX2
; and AVX512VL runs expect a constant-index vpermps.
1621define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
1622; AVX1-LABEL: shuffle_v8i32_10004444:
1623; AVX1:       # %bb.0:
1624; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
1625; AVX1-NEXT:    retq
1626;
1627; AVX2OR512VL-LABEL: shuffle_v8i32_10004444:
1628; AVX2OR512VL:       # %bb.0:
1629; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
1630; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1631; AVX2OR512VL-NEXT:    retq
1632  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1633  ret <8 x i32> %shuffle
1634}
1635
; In-lane single-input shuffle whose pattern differs between the two 128-bit
; lanes (mask 2,2,0,0 | 6,4,4,6). The AVX1 run expects one vpermilps; the AVX2
; and AVX512VL runs expect a constant-index vpermps.
1636define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
1637; AVX1-LABEL: shuffle_v8i32_22006446:
1638; AVX1:       # %bb.0:
1639; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
1640; AVX1-NEXT:    retq
1641;
1642; AVX2OR512VL-LABEL: shuffle_v8i32_22006446:
1643; AVX2OR512VL:       # %bb.0:
1644; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
1645; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1646; AVX2OR512VL-NEXT:    retq
1647  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
1648  ret <8 x i32> %shuffle
1649}
1650
; In-lane single-input shuffle whose pattern differs between the two 128-bit
; lanes (mask 3,3,3,0 | 7,4,7,4). The AVX1 run expects one vpermilps; the AVX2
; and AVX512VL runs expect a constant-index vpermps.
1651define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
1652; AVX1-LABEL: shuffle_v8i32_33307474:
1653; AVX1:       # %bb.0:
1654; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
1655; AVX1-NEXT:    retq
1656;
1657; AVX2OR512VL-LABEL: shuffle_v8i32_33307474:
1658; AVX2OR512VL:       # %bb.0:
1659; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
1660; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1661; AVX2OR512VL-NEXT:    retq
1662  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
1663  ret <8 x i32> %shuffle
1664}
1665
; Reverses the low 128-bit lane of %a and leaves the high lane in place
; (mask 3,2,1,0 | 4,5,6,7). The AVX1 run expects one vpermilps; the AVX2 and
; AVX512VL runs expect a constant-index vpermps.
1666define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
1667; AVX1-LABEL: shuffle_v8i32_32104567:
1668; AVX1:       # %bb.0:
1669; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
1670; AVX1-NEXT:    retq
1671;
1672; AVX2OR512VL-LABEL: shuffle_v8i32_32104567:
1673; AVX2OR512VL:       # %bb.0:
1674; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
1675; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1676; AVX2OR512VL-NEXT:    retq
1677  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
1678  ret <8 x i32> %shuffle
1679}
1680
; In-lane single-input shuffle whose pattern differs between the two 128-bit
; lanes (mask 0,0,2,3 | 6,7,4,4). The AVX1 run expects one vpermilps; the AVX2
; and AVX512VL runs expect a constant-index vpermps.
1681define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
1682; AVX1-LABEL: shuffle_v8i32_00236744:
1683; AVX1:       # %bb.0:
1684; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
1685; AVX1-NEXT:    retq
1686;
1687; AVX2OR512VL-LABEL: shuffle_v8i32_00236744:
1688; AVX2OR512VL:       # %bb.0:
1689; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
1690; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1691; AVX2OR512VL-NEXT:    retq
1692  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
1693  ret <8 x i32> %shuffle
1694}
1695
; In-lane single-input shuffle whose pattern differs between the two 128-bit
; lanes (mask 0,0,2,2 | 6,6,4,4). The AVX1 run expects one vpermilps; the AVX2
; and AVX512VL runs expect a constant-index vpermps.
1696define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
1697; AVX1-LABEL: shuffle_v8i32_00226644:
1698; AVX1:       # %bb.0:
1699; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
1700; AVX1-NEXT:    retq
1701;
1702; AVX2OR512VL-LABEL: shuffle_v8i32_00226644:
1703; AVX2OR512VL:       # %bb.0:
1704; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
1705; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1706; AVX2OR512VL-NEXT:    retq
1707  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
1708  ret <8 x i32> %shuffle
1709}
1710
; Swaps adjacent pairs in the low 128-bit lane of %a and leaves the high lane
; in place (mask 1,0,3,2 | 4,5,6,7). The AVX1 run expects one vpermilps; the
; AVX2 and AVX512VL runs expect a constant-index vpermps.
1711define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
1712; AVX1-LABEL: shuffle_v8i32_10324567:
1713; AVX1:       # %bb.0:
1714; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
1715; AVX1-NEXT:    retq
1716;
1717; AVX2OR512VL-LABEL: shuffle_v8i32_10324567:
1718; AVX2OR512VL:       # %bb.0:
1719; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
1720; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1721; AVX2OR512VL-NEXT:    retq
1722  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
1723  ret <8 x i32> %shuffle
1724}
1725
; Duplicates the odd elements within the low 128-bit lane of %a and leaves the
; high lane in place (mask 1,1,3,3 | 4,5,6,7). The AVX1 run expects one
; vpermilps; the AVX2 and AVX512VL runs expect a constant-index vpermps.
1726define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
1727; AVX1-LABEL: shuffle_v8i32_11334567:
1728; AVX1:       # %bb.0:
1729; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
1730; AVX1-NEXT:    retq
1731;
1732; AVX2OR512VL-LABEL: shuffle_v8i32_11334567:
1733; AVX2OR512VL:       # %bb.0:
1734; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
1735; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1736; AVX2OR512VL-NEXT:    retq
1737  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
1738  ret <8 x i32> %shuffle
1739}
1740
; Leaves the low 128-bit lane of %a in place and swaps the first two elements
; of the high lane (mask 0,1,2,3 | 5,4,6,7). The AVX1 run expects one
; vpermilps; the AVX2 and AVX512VL runs expect a constant-index vpermps.
1741define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
1742; AVX1-LABEL: shuffle_v8i32_01235467:
1743; AVX1:       # %bb.0:
1744; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
1745; AVX1-NEXT:    retq
1746;
1747; AVX2OR512VL-LABEL: shuffle_v8i32_01235467:
1748; AVX2OR512VL:       # %bb.0:
1749; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
1750; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1751; AVX2OR512VL-NEXT:    retq
1752  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
1753  ret <8 x i32> %shuffle
1754}
1755
; Leaves the low 128-bit lane of %a in place and permutes only the high lane
; (mask 0,1,2,3 | 5,4,6,6). The AVX1 run expects one vpermilps; the AVX2 and
; AVX512VL runs expect a constant-index vpermps.
1756define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
1757; AVX1-LABEL: shuffle_v8i32_01235466:
1758; AVX1:       # %bb.0:
1759; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
1760; AVX1-NEXT:    retq
1761;
1762; AVX2OR512VL-LABEL: shuffle_v8i32_01235466:
1763; AVX2OR512VL:       # %bb.0:
1764; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
1765; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1766; AVX2OR512VL-NEXT:    retq
1767  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
1768  ret <8 x i32> %shuffle
1769}
1770
; In-lane single-input shuffle with undef elements at result positions 3 and 5
; (mask 0,0,2,u | 6,u,4,4). The expected output carries the undef positions
; through as 'u' in both the vpermilps and the vpermps index constant.
1771define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
1772; AVX1-LABEL: shuffle_v8i32_002u6u44:
1773; AVX1:       # %bb.0:
1774; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
1775; AVX1-NEXT:    retq
1776;
1777; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44:
1778; AVX2OR512VL:       # %bb.0:
1779; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
1780; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1781; AVX2OR512VL-NEXT:    retq
1782  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
1783  ret <8 x i32> %shuffle
1784}
1785
; In-lane single-input shuffle where only the first two elements of each
; 128-bit lane are defined (mask 0,0,u,u | 6,6,u,u). The expected output
; carries the undef positions through as 'u'.
1786define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
1787; AVX1-LABEL: shuffle_v8i32_00uu66uu:
1788; AVX1:       # %bb.0:
1789; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
1790; AVX1-NEXT:    retq
1791;
1792; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu:
1793; AVX2OR512VL:       # %bb.0:
1794; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
1795; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1796; AVX2OR512VL-NEXT:    retq
1797  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
1798  ret <8 x i32> %shuffle
1799}
1800
; Swaps adjacent pairs in the low 128-bit lane of %a; the high lane keeps
; elements 4 and 5 and leaves the last two positions undef
; (mask 1,0,3,2 | 4,5,u,u).
1801define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
1802; AVX1-LABEL: shuffle_v8i32_103245uu:
1803; AVX1:       # %bb.0:
1804; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
1805; AVX1-NEXT:    retq
1806;
1807; AVX2OR512VL-LABEL: shuffle_v8i32_103245uu:
1808; AVX2OR512VL:       # %bb.0:
1809; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
1810; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1811; AVX2OR512VL-NEXT:    retq
1812  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
1813  ret <8 x i32> %shuffle
1814}
1815
; Duplicates the odd elements within the low 128-bit lane of %a; the high lane
; keeps elements 6 and 7 and leaves its first two positions undef
; (mask 1,1,3,3 | u,u,6,7).
1816define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
1817; AVX1-LABEL: shuffle_v8i32_1133uu67:
1818; AVX1:       # %bb.0:
1819; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
1820; AVX1-NEXT:    retq
1821;
1822; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67:
1823; AVX2OR512VL:       # %bb.0:
1824; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
1825; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1826; AVX2OR512VL-NEXT:    retq
1827  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
1828  ret <8 x i32> %shuffle
1829}
1830
; In-lane single-input shuffle with four undef positions
; (mask 0,u,u,3 | 5,4,u,u): the low lane keeps elements 0 and 3 in place, the
; high lane swaps elements 4 and 5.
1831define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
1832; AVX1-LABEL: shuffle_v8i32_0uu354uu:
1833; AVX1:       # %bb.0:
1834; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
1835; AVX1-NEXT:    retq
1836;
1837; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu:
1838; AVX2OR512VL:       # %bb.0:
1839; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
1840; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1841; AVX2OR512VL-NEXT:    retq
1842  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
1843  ret <8 x i32> %shuffle
1844}
1845
; Mostly-undef in-lane single-input shuffle (mask u,u,u,3 | u,u,6,6): only
; result positions 3, 6 and 7 are defined.
1846define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
1847; AVX1-LABEL: shuffle_v8i32_uuu3uu66:
1848; AVX1:       # %bb.0:
1849; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
1850; AVX1-NEXT:    retq
1851;
1852; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66:
1853; AVX2OR512VL:       # %bb.0:
1854; AVX2OR512VL-NEXT:    vmovaps {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
1855; AVX2OR512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1856; AVX2OR512VL-NEXT:    retq
1857  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
1858  ret <8 x i32> %shuffle
1859}
1860
; Two-input shuffle that mixes elements from both 128-bit lanes of %a and %b
; (mask 6,12,10,10,8,7,14,5). Expectations differ per subtarget, and for AVX2
; also between the slow and fast variable-shuffle runs.
1861define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
1862; AVX1-LABEL: shuffle_v8i32_6caa87e5:
1863; AVX1:       # %bb.0:
1864; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
1865; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
1866; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
1867; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6]
1868; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
1869; AVX1-NEXT:    retq
1870;
1871; AVX2-SLOW-LABEL: shuffle_v8i32_6caa87e5:
1872; AVX2-SLOW:       # %bb.0:
1873; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
1874; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
1875; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3]
1876; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
1877; AVX2-SLOW-NEXT:    retq
1878;
1879; AVX2-FAST-LABEL: shuffle_v8i32_6caa87e5:
1880; AVX2-FAST:       # %bb.0:
1881; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [4,4,2,2,0,0,6,6]
1882; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1
1883; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
1884; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
1885; AVX2-FAST-NEXT:    retq
1886;
1887; AVX512VL-LABEL: shuffle_v8i32_6caa87e5:
1888; AVX512VL:       # %bb.0:
1889; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,4,2,2,0,15,6,13]
1890; AVX512VL-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
1891; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
1892; AVX512VL-NEXT:    retq
1893  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
1894  ret <8 x i32> %shuffle
1895}
1896
; Reverses the low 128-bit half of %a and replicates it into both halves of
; the result (mask 3,2,1,0,3,2,1,0).
1897define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
1898; AVX1-LABEL: shuffle_v8i32_32103210:
1899; AVX1:       # %bb.0:
1900; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
1901; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1902; AVX1-NEXT:    retq
1903;
1904; AVX2OR512VL-LABEL: shuffle_v8i32_32103210:
1905; AVX2OR512VL:       # %bb.0:
1906; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
1907; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
1908; AVX2OR512VL-NEXT:    retq
1909  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
1910  ret <8 x i32> %shuffle
1911}
1912
; Reverses the high 128-bit half of %a and replicates it into both halves of
; the result (mask 7,6,5,4,7,6,5,4). The fast variable-shuffle runs expect a
; single constant-index vpermps; the other runs expect an in-lane permute
; followed by a cross-lane one.
1913define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
1914; AVX1-LABEL: shuffle_v8i32_76547654:
1915; AVX1:       # %bb.0:
1916; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1917; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
1918; AVX1-NEXT:    retq
1919;
1920; AVX2-SLOW-LABEL: shuffle_v8i32_76547654:
1921; AVX2-SLOW:       # %bb.0:
1922; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1923; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
1924; AVX2-SLOW-NEXT:    retq
1925;
1926; AVX2-FAST-LABEL: shuffle_v8i32_76547654:
1927; AVX2-FAST:       # %bb.0:
1928; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
1929; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1930; AVX2-FAST-NEXT:    retq
1931;
1932; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654:
1933; AVX512VL-SLOW:       # %bb.0:
1934; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1935; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
1936; AVX512VL-SLOW-NEXT:    retq
1937;
1938; AVX512VL-FAST-LABEL: shuffle_v8i32_76547654:
1939; AVX512VL-FAST:       # %bb.0:
1940; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
1941; AVX512VL-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1942; AVX512VL-FAST-NEXT:    retq
1943  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
1944  ret <8 x i32> %shuffle
1945}
1946
; Full reversal of all eight elements of %a (mask 7,6,5,4,3,2,1,0). The fast
; variable-shuffle runs expect a single constant-index vpermps; the other runs
; expect an in-lane reverse followed by a swap of the two 128-bit halves.
1947define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
1948; AVX1-LABEL: shuffle_v8i32_76543210:
1949; AVX1:       # %bb.0:
1950; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1951; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
1952; AVX1-NEXT:    retq
1953;
1954; AVX2-SLOW-LABEL: shuffle_v8i32_76543210:
1955; AVX2-SLOW:       # %bb.0:
1956; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1957; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
1958; AVX2-SLOW-NEXT:    retq
1959;
1960; AVX2-FAST-LABEL: shuffle_v8i32_76543210:
1961; AVX2-FAST:       # %bb.0:
1962; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
1963; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1964; AVX2-FAST-NEXT:    retq
1965;
1966; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210:
1967; AVX512VL-SLOW:       # %bb.0:
1968; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1969; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
1970; AVX512VL-SLOW-NEXT:    retq
1971;
1972; AVX512VL-FAST-LABEL: shuffle_v8i32_76543210:
1973; AVX512VL-FAST:       # %bb.0:
1974; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
1975; AVX512VL-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1976; AVX512VL-FAST-NEXT:    retq
1977  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
1978  ret <8 x i32> %shuffle
1979}
1980
; Low result half is the reversed low half of %a; high result half is the
; reversed low half of %b (mask 3,2,1,0,11,10,9,8). Every RUN configuration
; expects a 128-bit insert followed by an in-lane reverse.
1981define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
1982; ALL-LABEL: shuffle_v8i32_3210ba98:
1983; ALL:       # %bb.0:
1984; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1985; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1986; ALL-NEXT:    retq
1987  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
1988  ret <8 x i32> %shuffle
1989}
1990
; Low result half is the reversed low half of %a; high result half is the
; reversed high half of %b (mask 3,2,1,0,15,14,13,12). Every RUN configuration
; expects a blend followed by an in-lane reverse.
1991define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
1992; ALL-LABEL: shuffle_v8i32_3210fedc:
1993; ALL:       # %bb.0:
1994; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1995; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1996; ALL-NEXT:    retq
1997  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
1998  ret <8 x i32> %shuffle
1999}
2000
; Low result half is the reversed high half of %a; high result half is the
; reversed high half of %b (mask 7,6,5,4,15,14,13,12). The AVX512VL fast
; variable-shuffle run expects a single vpermt2d instead of the two-step
; sequence used elsewhere.
2001define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
2002; AVX1OR2-LABEL: shuffle_v8i32_7654fedc:
2003; AVX1OR2:       # %bb.0:
2004; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
2005; AVX1OR2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2006; AVX1OR2-NEXT:    retq
2007;
2008; AVX512VL-SLOW-LABEL: shuffle_v8i32_7654fedc:
2009; AVX512VL-SLOW:       # %bb.0:
2010; AVX512VL-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
2011; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2012; AVX512VL-SLOW-NEXT:    retq
2013;
2014; AVX512VL-FAST-LABEL: shuffle_v8i32_7654fedc:
2015; AVX512VL-FAST:       # %bb.0:
2016; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12]
2017; AVX512VL-FAST-NEXT:    vpermt2d %ymm1, %ymm2, %ymm0
2018; AVX512VL-FAST-NEXT:    retq
2019  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
2020  ret <8 x i32> %shuffle
2021}
2022
; Low result half is the reversed high half of %b; high result half is the
; reversed high half of %a (mask 15,14,13,12,7,6,5,4). The AVX512VL fast
; variable-shuffle run expects vpermi2d with the operands swapped, plus a
; register copy into ymm0.
2023define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
2024; AVX1OR2-LABEL: shuffle_v8i32_fedc7654:
2025; AVX1OR2:       # %bb.0:
2026; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2027; AVX1OR2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2028; AVX1OR2-NEXT:    retq
2029;
2030; AVX512VL-SLOW-LABEL: shuffle_v8i32_fedc7654:
2031; AVX512VL-SLOW:       # %bb.0:
2032; AVX512VL-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
2033; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2034; AVX512VL-SLOW-NEXT:    retq
2035;
2036; AVX512VL-FAST-LABEL: shuffle_v8i32_fedc7654:
2037; AVX512VL-FAST:       # %bb.0:
2038; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,15,14,13,12]
2039; AVX512VL-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
2040; AVX512VL-FAST-NEXT:    vmovdqa %ymm2, %ymm0
2041; AVX512VL-FAST-NEXT:    retq
2042  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
2043  ret <8 x i32> %shuffle
2044}
2045
; Low result half is the reversed low half of %b; high result half is the
; reversed high half of %a (mask 11,10,9,8,7,6,5,4). Every RUN configuration
; expects a blend followed by an in-lane reverse.
2046define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
2047; ALL-LABEL: shuffle_v8i32_ba987654:
2048; ALL:       # %bb.0:
2049; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2050; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2051; ALL-NEXT:    retq
2052  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
2053  ret <8 x i32> %shuffle
2054}
2055
; Low result half is the reversed low half of %b; high result half is the
; reversed low half of %a (mask 11,10,9,8,3,2,1,0), as the name ba983210 spells
; out. The mask used to be 11,10,9,8,7,6,5,4, an exact duplicate of
; shuffle_v8i32_ba987654, so the "3210" high half was never exercised.
; NOTE(review): the expected instructions below mirror shuffle_v8i32_3210ba98
; with the two operands swapped and were not machine-generated; rerun
; utils/update_llc_test_checks.py on this file to confirm them.
2056define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
2057; ALL-LABEL: shuffle_v8i32_ba983210:
2058; ALL:       # %bb.0:
2059; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2060; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
2061; ALL-NEXT:    retq
2062  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
2063  ret <8 x i32> %shuffle
2064}
2065
; The first shuffle operand is zeroinitializer, so mask index 0 yields zero and
; indices 8 and 12 select elements 0 and 4 of %a: each 128-bit lane becomes
; zero,undef,undef,element (mask 0,u,u,8 | 0,u,u,12). The AVX2 and AVX512VL
; runs expect a single in-lane byte shift (vpslldq).
2066define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
2067; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
2068; AVX1:       # %bb.0:
2069; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2070; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
2071; AVX1-NEXT:    retq
2072;
2073; AVX2OR512VL-LABEL: shuffle_v8i32_zuu8zuuc:
2074; AVX2OR512VL:       # %bb.0:
2075; AVX2OR512VL-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19]
2076; AVX2OR512VL-NEXT:    retq
2077  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 8, i32 0, i32 undef, i32 undef, i32 12>
2078  ret <8 x i32> %shuffle
2079}
2080
; The first shuffle operand is zeroinitializer, so mask index 0 yields zero and
; indices 9-15 select from %a: each 128-bit lane of %a moves down by one
; element with a zero in the last position (mask 9,u,11,0 | 13,14,15,0). The
; AVX2 and AVX512VL runs expect a single in-lane byte shift (vpsrldq).
2081define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
2082; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
2083; AVX1:       # %bb.0:
2084; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2085; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
2086; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
2087; AVX1-NEXT:    retq
2088;
2089; AVX2OR512VL-LABEL: shuffle_v8i32_9ubzdefz:
2090; AVX2OR512VL:       # %bb.0:
2091; AVX2OR512VL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,ymm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero
2092; AVX2OR512VL-NEXT:    retq
2093  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 9, i32 undef, i32 11, i32 0, i32 13, i32 14, i32 15, i32 0>
2094  ret <8 x i32> %shuffle
2095}
2096
; Interleaves the low elements of each 128-bit lane of %b and %a, with several
; result slots left undef. Every run configuration expects a single vunpcklps
; with %b (ymm1) as the first source.
; NOTE(review): mask element 4 is 12 (hex "c"), while the function name spells
; that position as "b" (11) -- confirm which of the two was intended.
2097define <8 x i32> @shuffle_v8i32_80u1b4uu(<8 x i32> %a, <8 x i32> %b) {
2098; ALL-LABEL: shuffle_v8i32_80u1b4uu:
2099; ALL:       # %bb.0:
2100; ALL-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
2101; ALL-NEXT:    retq
2102  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 0, i32 undef, i32 1, i32 12, i32 4, i32 undef, i32 undef>
2103  ret <8 x i32> %shuffle
2104}
2105
; The low four result elements are undef and the high four are all %a[1]
; (%b is unused by the mask). Every run configuration expects a 128-bit
; vpermilps splat of element 1 followed by a vinsertf128 that places it in
; the upper half.
2106define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) {
2107; ALL-LABEL: shuffle_v8i32_uuuu1111:
2108; ALL:       # %bb.0:
2109; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
2110; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2111; ALL-NEXT:    retq
2112  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
2113  ret <8 x i32> %shuffle
2114}
2115
; The low four result elements are all %a[2] and the high four are undef
; (%b is unused by the mask). Because the upper half is undef, every run
; configuration expects only a 128-bit vpermilps on xmm0.
2116define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
2117; ALL-LABEL: shuffle_v8i32_2222uuuu:
2118; ALL:       # %bb.0:
2119; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2]
2120; ALL-NEXT:    retq
2121  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
2122  ret <8 x i32> %shuffle
2123}
2124
; The low four result elements interleave %a[2], %b[2], %a[3], %b[3]; the high
; four are undef. Every run configuration expects a single 128-bit vunpckhps.
2125define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
2126; ALL-LABEL: shuffle_v8i32_2A3Buuuu:
2127; ALL:       # %bb.0:
2128; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2129; ALL-NEXT:    retq
2130  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
2131  ret <8 x i32> %shuffle
2132}
2133
; Splat of %a[4], the first element of the upper 128-bit lane, to all eight
; result elements (%b is unused by the mask). The AVX-only run expects an
; in-lane vpermilps followed by a vperm2f128 that duplicates the upper lane;
; the AVX2 and AVX512VL runs expect vextractf128 followed by vbroadcastss.
2134define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
2135; AVX1-LABEL: shuffle_v8i32_44444444:
2136; AVX1:       # %bb.0:
2137; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
2138; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
2139; AVX1-NEXT:    retq
2140;
2141; AVX2OR512VL-LABEL: shuffle_v8i32_44444444:
2142; AVX2OR512VL:       # %bb.0:
2143; AVX2OR512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
2144; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %ymm0
2145; AVX2OR512VL-NEXT:    retq
2146  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
2147  ret <8 x i32> %shuffle
2148}
2149
; Splat of element 4 where both shuffle operands are <8 x float> arguments
; bitcast to <8 x i32> before the shufflevector. The AVX-only run expects
; vpermilps + vperm2f128; the AVX2 and AVX512VL runs expect vextractf128 +
; vbroadcastss.
2150define <8 x i32> @shuffle_v8i32_44444444_bc(<8 x float> %a, <8 x float> %b) {
2151; AVX1-LABEL: shuffle_v8i32_44444444_bc:
2152; AVX1:       # %bb.0:
2153; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
2154; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
2155; AVX1-NEXT:    retq
2156;
2157; AVX2OR512VL-LABEL: shuffle_v8i32_44444444_bc:
2158; AVX2OR512VL:       # %bb.0:
2159; AVX2OR512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
2160; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %ymm0
2161; AVX2OR512VL-NEXT:    retq
2162  %tmp0 = bitcast <8 x float> %a to <8 x i32>
2163  %tmp1 = bitcast <8 x float> %b to <8 x i32>
2164  %shuffle = shufflevector <8 x i32> %tmp0, <8 x i32> %tmp1, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
2165  ret <8 x i32> %shuffle
2166}
2167
; The low four result elements are all %a[5] and the high four are undef
; (%b is unused by the mask). Every run configuration expects the upper lane
; to be extracted with vextractf128 and then splatted with a 128-bit
; vpermilps.
2168define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
2169; ALL-LABEL: shuffle_v8i32_5555uuuu:
2170; ALL:       # %bb.0:
2171; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
2172; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
2173; ALL-NEXT:    retq
2174  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
2175  ret <8 x i32> %shuffle
2176}
2177
2178; PR32453
; Mask with a single defined element: result element 6 is %a[7] and every
; other element is undef. The AVX-only run expects a vmovshdup; the AVX2 and
; AVX512VL runs expect a vpermilps with the in-lane pattern [0,1,3,3].
2179define <8 x i32> @shuffle_v8i32_uuuuuu7u(<8 x i32> %a, <8 x i32> %b) nounwind {
2180; AVX1-LABEL: shuffle_v8i32_uuuuuu7u:
2181; AVX1:       # %bb.0:
2182; AVX1-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
2183; AVX1-NEXT:    retq
2184;
2185; AVX2OR512VL-LABEL: shuffle_v8i32_uuuuuu7u:
2186; AVX2OR512VL:       # %bb.0:
2187; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,3,3,4,5,7,7]
2188; AVX2OR512VL-NEXT:    retq
2189  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef>
2190  ret <8 x i32> %shuffle
2191}
2192
; Loads one float from %p, inserts it into element 0 of an otherwise undef
; <4 x float>, and splats that element to eight lanes with an all-zero
; shuffle mask. Every run configuration expects the load and splat to fold
; into a single vbroadcastss from memory.
2193define <8 x float> @splat_mem_v8f32_2(float* %p) {
2194; ALL-LABEL: splat_mem_v8f32_2:
2195; ALL:       # %bb.0:
2196; ALL-NEXT:    vbroadcastss (%rdi), %ymm0
2197; ALL-NEXT:    retq
2198  %1 = load float, float* %p
2199  %2 = insertelement <4 x float> undef, float %1, i32 0
2200  %3 = shufflevector <4 x float> %2, <4 x float> undef, <8 x i32> zeroinitializer
2201  ret <8 x float> %3
2202}
2203
; Splats element 0 of the <4 x float> argument %r to all eight lanes of the
; result. The AVX-only run expects a 128-bit vpermilps splat followed by
; vinsertf128; the AVX2 and AVX512VL runs expect a register-source
; vbroadcastss.
2204define <8 x float> @splat_v8f32(<4 x float> %r) {
2205; AVX1-LABEL: splat_v8f32:
2206; AVX1:       # %bb.0:
2207; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2208; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2209; AVX1-NEXT:    retq
2210;
2211; AVX2OR512VL-LABEL: splat_v8f32:
2212; AVX2OR512VL:       # %bb.0:
2213; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %ymm0
2214; AVX2OR512VL-NEXT:    retq
2215  %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
2216  ret <8 x float> %1
2217}
2218
2219;
2220; Shuffle to logical bit shifts
2221;
2222
; Shuffle of %a with a zero vector in which every 64-bit pair of result
; elements is (zero-or-undef, even element of %a). The AVX2 and AVX512VL runs
; expect this to be recognised as a 64-bit logical left shift by 32
; (vpsllq); the AVX-only run expects vxorps + vshufps + vpermilps.
2223define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
2224; AVX1-LABEL: shuffle_v8i32_z0U2zUz6:
2225; AVX1:       # %bb.0:
2226; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2227; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
2228; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
2229; AVX1-NEXT:    retq
2230;
2231; AVX2OR512VL-LABEL: shuffle_v8i32_z0U2zUz6:
2232; AVX2OR512VL:       # %bb.0:
2233; AVX2OR512VL-NEXT:    vpsllq $32, %ymm0, %ymm0
2234; AVX2OR512VL-NEXT:    retq
2235  %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 8, i32 0, i32 undef, i32 2, i32 8, i32 undef, i32 8, i32 6>
2236  ret <8 x i32> %shuffle
2237}
2238
; Shuffle of %a with a zero vector in which every 64-bit pair of result
; elements is (odd element of %a, zero-or-undef). The AVX2 and AVX512VL runs
; expect this to be recognised as a 64-bit logical right shift by 32
; (vpsrlq); the AVX-only run expects vxorps + vshufps + vpermilps.
2239define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) {
2240; AVX1-LABEL: shuffle_v8i32_1U3z5zUU:
2241; AVX1:       # %bb.0:
2242; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2243; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
2244; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
2245; AVX1-NEXT:    retq
2246;
2247; AVX2OR512VL-LABEL: shuffle_v8i32_1U3z5zUU:
2248; AVX2OR512VL:       # %bb.0:
2249; AVX2OR512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
2250; AVX2OR512VL-NEXT:    retq
2251  %shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 8, i32 undef, i32 undef>
2252  ret <8 x i32> %shuffle
2253}
2254
; Per-lane two-input rotation: each 128-bit lane of the result is the top
; element of the matching lane of %b followed by the low three elements of
; the matching lane of %a. The AVX2 and AVX512VL runs expect a single
; vpalignr; the AVX-only run expects two vshufps.
2255define <8 x i32> @shuffle_v8i32_B012F456(<8 x i32> %a, <8 x i32> %b) {
2256; AVX1-LABEL: shuffle_v8i32_B012F456:
2257; AVX1:       # %bb.0:
2258; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
2259; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[1,2],ymm1[4,6],ymm0[5,6]
2260; AVX1-NEXT:    retq
2261;
2262; AVX2OR512VL-LABEL: shuffle_v8i32_B012F456:
2263; AVX2OR512VL:       # %bb.0:
2264; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27]
2265; AVX2OR512VL-NEXT:    retq
2266  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6>
2267  ret <8 x i32> %shuffle
2268}
2269
; Per-lane two-input rotation: each 128-bit lane of the result is the top
; three elements of the matching lane of %a followed by the lowest element of
; the matching lane of %b. The AVX2 and AVX512VL runs expect a single
; vpalignr; the AVX-only run expects two vshufps.
2270define <8 x i32> @shuffle_v8i32_1238567C(<8 x i32> %a, <8 x i32> %b) {
2271; AVX1-LABEL: shuffle_v8i32_1238567C:
2272; AVX1:       # %bb.0:
2273; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
2274; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
2275; AVX1-NEXT:    retq
2276;
2277; AVX2OR512VL-LABEL: shuffle_v8i32_1238567C:
2278; AVX2OR512VL:       # %bb.0:
2279; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3],ymm0[20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19]
2280; AVX2OR512VL-NEXT:    retq
2281  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
2282  ret <8 x i32> %shuffle
2283}
2284
; Per-lane two-input rotation with the operand roles swapped: each 128-bit
; lane of the result is the top three elements of the matching lane of %b
; followed by the lowest element of the matching lane of %a. The AVX2 and
; AVX512VL runs expect a single vpalignr; the AVX-only run expects two
; vshufps.
2285define <8 x i32> @shuffle_v8i32_9AB0DEF4(<8 x i32> %a, <8 x i32> %b) {
2286; AVX1-LABEL: shuffle_v8i32_9AB0DEF4:
2287; AVX1:       # %bb.0:
2288; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[3,0],ymm0[4,4],ymm1[7,4]
2289; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm0[2,0],ymm1[5,6],ymm0[6,4]
2290; AVX1-NEXT:    retq
2291;
2292; AVX2OR512VL-LABEL: shuffle_v8i32_9AB0DEF4:
2293; AVX2OR512VL:       # %bb.0:
2294; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19]
2295; AVX2OR512VL-NEXT:    retq
2296  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15, i32 4>
2297  ret <8 x i32> %shuffle
2298}
2299
; Per-lane two-input rotation: each 128-bit lane of the result is the top
; element of the matching lane of %a followed by the low three elements of
; the matching lane of %b. The AVX2 and AVX512VL runs expect a single
; vpalignr; the AVX-only run expects two vshufps.
2300define <8 x i32> @shuffle_v8i32_389A7CDE(<8 x i32> %a, <8 x i32> %b) {
2301; AVX1-LABEL: shuffle_v8i32_389A7CDE:
2302; AVX1:       # %bb.0:
2303; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4]
2304; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[1,2],ymm0[4,6],ymm1[5,6]
2305; AVX1-NEXT:    retq
2306;
2307; AVX2OR512VL-LABEL: shuffle_v8i32_389A7CDE:
2308; AVX2OR512VL:       # %bb.0:
2309; AVX2OR512VL-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
2310; AVX2OR512VL-NEXT:    retq
2311  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 8, i32 9, i32 10, i32 7, i32 12, i32 13, i32 14>
2312  ret <8 x i32> %shuffle
2313}
2314
; Single-input in-lane rotation: within each 128-bit lane of %a the top
; element moves to the bottom and the others move up by one (%b is unused by
; the mask). Every run configuration expects a single vpermilps.
2315define <8 x i32> @shuffle_v8i32_30127456(<8 x i32> %a, <8 x i32> %b) {
2316; ALL-LABEL: shuffle_v8i32_30127456:
2317; ALL:       # %bb.0:
2318; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6]
2319; ALL-NEXT:    retq
2320  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6>
2321  ret <8 x i32> %shuffle
2322}
2323
; Single-input in-lane rotation in the opposite direction: within each
; 128-bit lane of %a the bottom element moves to the top and the others move
; down by one (%b is unused by the mask). Every run configuration expects a
; single vpermilps.
2324define <8 x i32> @shuffle_v8i32_12305674(<8 x i32> %a, <8 x i32> %b) {
2325; ALL-LABEL: shuffle_v8i32_12305674:
2326; ALL:       # %bb.0:
2327; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
2328; ALL-NEXT:    retq
2329  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4>
2330  ret <8 x i32> %shuffle
2331}
2332
; Concatenates two <2 x float> loads into the low four elements of an
; <8 x float> (elements 4-7 undef). Each load is first widened to eight
; elements with undef padding, then the two widened vectors are merged so that
; elements 0-1 come from the %tmp64 load and elements 2-3 from the %tmp65
; load. Every run configuration expects a vmovsd load followed by a vmovhpd
; load into the upper half of xmm0.
2333define <8 x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
2334; ALL-LABEL: concat_v2f32_1:
2335; ALL:       # %bb.0: # %entry
2336; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2337; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2338; ALL-NEXT:    retq
2339entry:
2340  %tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
2341  %tmp72 = load <2 x float>, <2 x float>* %tmp64, align 8
2342  %tmp73 = shufflevector <2 x float> %tmp72, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2343  %tmp75 = shufflevector <2 x float> %tmp74, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2344  %tmp76 = shufflevector <8 x float> %tmp73, <8 x float> %tmp75, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
2345  ret <8 x float> %tmp76
2346}
2347
; Concatenates two <2 x float> loads into the low four elements of an
; <8 x float> (elements 4-7 undef) with a single widening two-input
; shufflevector: elements 0-1 come from the %tmp64 load and elements 2-3 from
; the %tmp65 load. Every run configuration expects a vmovsd load followed by
; a vmovhpd load into the upper half of xmm0.
2348define <8 x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
2349; ALL-LABEL: concat_v2f32_2:
2350; ALL:       # %bb.0: # %entry
2351; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2352; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2353; ALL-NEXT:    retq
2354entry:
2355  %tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
2356  %tmp72 = load <2 x float>, <2 x float>* %tmp64, align 8
2357  %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
2358  ret <8 x float> %tmp76
2359}
2360
; Concatenates two <2 x float> loads in two steps: first into a <4 x float>
; (elements 0-1 from the %tmp64 load, 2-3 from the %tmp65 load), then widened
; to <8 x float> with the upper four elements undef. Every run configuration
; expects a vmovsd load followed by a vmovhpd load into the upper half of
; xmm0.
2361define <8 x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
2362; ALL-LABEL: concat_v2f32_3:
2363; ALL:       # %bb.0: # %entry
2364; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2365; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2366; ALL-NEXT:    retq
2367entry:
2368  %tmp74 = load <2 x float>, <2 x float>* %tmp65, align 8
2369  %tmp72 = load <2 x float>, <2 x float>* %tmp64, align 8
2370  %tmp76 = shufflevector <2 x float> %tmp72, <2 x float> %tmp74, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2371  %res = shufflevector <4 x float> %tmp76, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
2372  ret <8 x float> %res
2373}
2374
; Loads an i32 from %ptr into element 0 of a vector and takes elements 1-7
; from a zero vector (mask indices 9-15). Every run configuration expects the
; whole sequence to fold into a single vmovss load.
2375define <8 x i32> @insert_mem_and_zero_v8i32(i32* %ptr) {
2376; ALL-LABEL: insert_mem_and_zero_v8i32:
2377; ALL:       # %bb.0:
2378; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2379; ALL-NEXT:    retq
2380  %a = load i32, i32* %ptr
2381  %v = insertelement <8 x i32> undef, i32 %a, i32 0
2382  %shuffle = shufflevector <8 x i32> %v, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
2383  ret <8 x i32> %shuffle
2384}
2385
; Extracts the low half of %a and the high half of %b as <4 x i32> vectors and
; concatenates them back into an <8 x i32>. Every run configuration expects
; the extract/concat chain to collapse into a single vblendps.
2386define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) {
2387; ALL-LABEL: concat_v8i32_0123CDEF:
2388; ALL:       # %bb.0:
2389; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2390; ALL-NEXT:    retq
2391  %alo = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2392  %bhi = shufflevector <8 x i32> %b, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2393  %shuf = shufflevector <4 x i32> %alo, <4 x i32> %bhi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2394  ret <8 x i32> %shuf
2395}
2396
; Extracts the high halves of %a0 and %a1, bitcasts each to <2 x i64>,
; concatenates them as a <4 x i64>, and bitcasts the result back to
; <8 x i32>. The AVX and AVX2 runs expect a single vperm2f128; the AVX512VL
; runs expect the integer form vperm2i128.
2397define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) {
2398; AVX1OR2-LABEL: concat_v8i32_4567CDEF_bc:
2399; AVX1OR2:       # %bb.0:
2400; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
2401; AVX1OR2-NEXT:    retq
2402;
2403; AVX512VL-LABEL: concat_v8i32_4567CDEF_bc:
2404; AVX512VL:       # %bb.0:
2405; AVX512VL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
2406; AVX512VL-NEXT:    retq
2407  %a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2408  %a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
2409  %bc0hi = bitcast <4 x i32> %a0hi to <2 x i64>
2410  %bc1hi = bitcast <4 x i32> %a1hi to <2 x i64>
2411  %shuffle64 = shufflevector <2 x i64> %bc0hi, <2 x i64> %bc1hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2412  %shuffle32 = bitcast <4 x i64> %shuffle64 to <8 x i32>
2413  ret <8 x i32> %shuffle32
2414}
2415
; Float variant of the high-half concatenation routed through integer
; bitcasts: %f0 is viewed as <4 x i64> and %f1 as <8 x i32>, the high half of
; each is extracted, both are brought to <2 x i64>, concatenated, and the
; <4 x i64> result is bitcast back to <8 x float>. Every run configuration
; expects a single vperm2f128.
2416define <8 x float> @concat_v8f32_4567CDEF_bc(<8 x float> %f0, <8 x float> %f1) {
2417; ALL-LABEL: concat_v8f32_4567CDEF_bc:
2418; ALL:       # %bb.0:
2419; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
2420; ALL-NEXT:    retq
2421  %a0 = bitcast <8 x float> %f0 to <4 x i64>
2422  %a1 = bitcast <8 x float> %f1 to <8 x i32>
2423  %a0hi = shufflevector <4 x i64> %a0, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
2424  %a1hi = shufflevector <8 x i32> %a1, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; The next bitcast has identical source and destination types, so it is a
; no-op.
2425  %bc0hi = bitcast <2 x i64> %a0hi to <2 x i64>
2426  %bc1hi = bitcast <4 x i32> %a1hi to <2 x i64>
2427  %shuffle64 = shufflevector <2 x i64> %bc0hi, <2 x i64> %bc1hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2428  %shuffle32 = bitcast <4 x i64> %shuffle64 to <8 x float>
2429  ret <8 x float> %shuffle32
2430}
2431
; Loads an i32 from %ptr, inserts it into element 0 of a zero <4 x i32>, and
; splats that element to eight lanes with an all-zero shuffle mask. Every run
; configuration expects the load and splat to fold into a single vbroadcastss
; from memory.
2432define <8 x i32> @insert_dup_mem_v8i32(i32* %ptr) {
2433; ALL-LABEL: insert_dup_mem_v8i32:
2434; ALL:       # %bb.0:
2435; ALL-NEXT:    vbroadcastss (%rdi), %ymm0
2436; ALL-NEXT:    retq
2437  %tmp = load i32, i32* %ptr, align 4
2438  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2439  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <8 x i32> zeroinitializer
2440  ret <8 x i32> %tmp2
2441}
2442
; Cross-lane two-input element shift: the result is %a[1] through %a[7]
; followed by %b[0]. The AVX-only run expects vblendps, vperm2f128 and two
; vshufps; the AVX2 runs expect vblendps followed by a vpermps with a constant
; index vector; the AVX512VL runs expect a single valignd.
2443define <8 x i32> @shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b) {
2444; AVX1-LABEL: shuffle_v8i32_12345678:
2445; AVX1:       # %bb.0:
2446; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
2447; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
2448; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
2449; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
2450; AVX1-NEXT:    retq
2451;
2452; AVX2-LABEL: shuffle_v8i32_12345678:
2453; AVX2:       # %bb.0:
2454; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
2455; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
2456; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2457; AVX2-NEXT:    retq
2458;
2459; AVX512VL-LABEL: shuffle_v8i32_12345678:
2460; AVX512VL:       # %bb.0:
2461; AVX512VL-NEXT:    valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7],ymm1[0]
2462; AVX512VL-NEXT:    retq
2463  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
2464  ret <8 x i32> %shuffle
2465}
2466
; Single-input cross-lane rotation of %a by one element: the result is %a[1]
; through %a[7] followed by %a[0]. The AVX-only run expects vperm2f128 and two
; vshufps; the AVX2 runs expect a vpermps with a constant index vector; the
; AVX512VL runs expect a single valignd.
2467define <8 x i32> @shuffle_v8i32_12345670(<8 x i32> %a) {
2468; AVX1-LABEL: shuffle_v8i32_12345670:
2469; AVX1:       # %bb.0:
2470; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
2471; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
2472; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
2473; AVX1-NEXT:    retq
2474;
2475; AVX2-LABEL: shuffle_v8i32_12345670:
2476; AVX2:       # %bb.0:
2477; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
2478; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2479; AVX2-NEXT:    retq
2480;
2481; AVX512VL-LABEL: shuffle_v8i32_12345670:
2482; AVX512VL:       # %bb.0:
2483; AVX512VL-NEXT:    valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,0]
2484; AVX512VL-NEXT:    retq
2485  %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
2486  ret <8 x i32> %shuffle
2487}
2488
; Builds two shuffles of %a and %b -- the even-indexed elements of the
; concatenation (0,2,...,14) and the odd-indexed elements (1,3,...,15) -- and
; adds them with fadd. Expected lowering per configuration: the AVX-only run
; uses 128-bit extract/vshufps/insert/blend sequences before a vaddps; the
; AVX2 runs and the AVX512VL run without fast-variable-shuffle use vshufps +
; vpermpd for each shuffle; the AVX512VL run with fast-variable-shuffle uses
; two vpermi2ps with constant index vectors.
2489define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b) {
2490; AVX1-LABEL: add_v8f32_02468ACE_13579BDF:
2491; AVX1:       # %bb.0: # %entry
2492; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2493; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[0,2],xmm2[0,2]
2494; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2495; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
2496; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm0[0,2],xmm4[0,2]
2497; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
2498; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
2499; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2500; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
2501; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2502; AVX1-NEXT:    vaddps %ymm0, %ymm3, %ymm0
2503; AVX1-NEXT:    retq
2504;
2505; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
2506; AVX2:       # %bb.0: # %entry
2507; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
2508; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
2509; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
2510; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
2511; AVX2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
2512; AVX2-NEXT:    retq
2513;
2514; AVX512VL-SLOW-LABEL: add_v8f32_02468ACE_13579BDF:
2515; AVX512VL-SLOW:       # %bb.0: # %entry
2516; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
2517; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
2518; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
2519; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
2520; AVX512VL-SLOW-NEXT:    vaddps %ymm0, %ymm2, %ymm0
2521; AVX512VL-SLOW-NEXT:    retq
2522;
2523; AVX512VL-FAST-LABEL: add_v8f32_02468ACE_13579BDF:
2524; AVX512VL-FAST:       # %bb.0: # %entry
2525; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
2526; AVX512VL-FAST-NEXT:    vpermi2ps %ymm1, %ymm0, %ymm2
2527; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
2528; AVX512VL-FAST-NEXT:    vpermi2ps %ymm1, %ymm0, %ymm3
2529; AVX512VL-FAST-NEXT:    vaddps %ymm3, %ymm2, %ymm0
2530; AVX512VL-FAST-NEXT:    retq
2531entry:
2532  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2533  %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2534  %add = fadd <8 x float> %shuffle, %shuffle1
2535  ret <8 x float> %add
2536}
2537
; Same even/odd element fadd as the 02468ACE_13579BDF pattern but with the
; halves swapped: the low four result elements are built from %b and the
; high four from %a. Expected lowering per configuration: the AVX-only run
; uses 128-bit extract/vshufps/insert/blend sequences before a vaddps; the
; AVX2 runs and the AVX512VL run without fast-variable-shuffle use vshufps +
; vpermpd with %b (ymm1) as the first source; the AVX512VL run with
; fast-variable-shuffle uses two vpermi2ps with the operand order swapped.
2538define <8 x float> @add_v8f32_8ACE0246_9BDF1357(<8 x float> %a, <8 x float> %b) {
2539; AVX1-LABEL: add_v8f32_8ACE0246_9BDF1357:
2540; AVX1:       # %bb.0: # %entry
2541; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2542; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm2[0,2]
2543; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2544; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
2545; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm1[0,2],xmm4[0,2]
2546; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
2547; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
2548; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2549; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
2550; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2551; AVX1-NEXT:    vaddps %ymm0, %ymm3, %ymm0
2552; AVX1-NEXT:    retq
2553;
2554; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357:
2555; AVX2:       # %bb.0: # %entry
2556; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
2557; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
2558; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
2559; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
2560; AVX2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
2561; AVX2-NEXT:    retq
2562;
2563; AVX512VL-SLOW-LABEL: add_v8f32_8ACE0246_9BDF1357:
2564; AVX512VL-SLOW:       # %bb.0: # %entry
2565; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
2566; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
2567; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
2568; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
2569; AVX512VL-SLOW-NEXT:    vaddps %ymm0, %ymm2, %ymm0
2570; AVX512VL-SLOW-NEXT:    retq
2571;
2572; AVX512VL-FAST-LABEL: add_v8f32_8ACE0246_9BDF1357:
2573; AVX512VL-FAST:       # %bb.0: # %entry
2574; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
2575; AVX512VL-FAST-NEXT:    vpermi2ps %ymm0, %ymm1, %ymm2
2576; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
2577; AVX512VL-FAST-NEXT:    vpermi2ps %ymm0, %ymm1, %ymm3
2578; AVX512VL-FAST-NEXT:    vaddps %ymm3, %ymm2, %ymm0
2579; AVX512VL-FAST-NEXT:    retq
2580entry:
2581  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6>
2582  %shuffle1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7>
2583  %add = fadd <8 x float> %shuffle, %shuffle1
2584  ret <8 x float> %add
2585}
2586
; Integer version of the even/odd element add: one shuffle picks the
; even-indexed elements of the concatenation of %a and %b, the other picks
; the odd-indexed elements, and the two are added with an integer add.
; Expected lowering per configuration: the AVX-only run builds both shuffles
; with 128-bit sequences and performs the add as two 128-bit vpaddd that are
; recombined with vinsertf128; the AVX2 runs and the AVX512VL run without
; fast-variable-shuffle use vshufps + vpermpd and a 256-bit vpaddd; the
; AVX512VL run with fast-variable-shuffle uses two vpermi2d with constant
; index vectors.
2587define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) {
2588; AVX1-LABEL: add_v8i32_02468ACE_13579BDF:
2589; AVX1:       # %bb.0: # %entry
2590; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2591; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm1[0,2],xmm2[0,2]
2592; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2593; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
2594; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm0[0,2],xmm4[0,2]
2595; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
2596; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
2597; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
2598; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
2599; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2600; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2601; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
2602; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
2603; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
2604; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2605; AVX1-NEXT:    retq
2606;
2607; AVX2-LABEL: add_v8i32_02468ACE_13579BDF:
2608; AVX2:       # %bb.0: # %entry
2609; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
2610; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
2611; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
2612; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
2613; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
2614; AVX2-NEXT:    retq
2615;
2616; AVX512VL-SLOW-LABEL: add_v8i32_02468ACE_13579BDF:
2617; AVX512VL-SLOW:       # %bb.0: # %entry
2618; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
2619; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
2620; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
2621; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
2622; AVX512VL-SLOW-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
2623; AVX512VL-SLOW-NEXT:    retq
2624;
2625; AVX512VL-FAST-LABEL: add_v8i32_02468ACE_13579BDF:
2626; AVX512VL-FAST:       # %bb.0: # %entry
2627; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
2628; AVX512VL-FAST-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
2629; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
2630; AVX512VL-FAST-NEXT:    vpermi2d %ymm1, %ymm0, %ymm3
2631; AVX512VL-FAST-NEXT:    vpaddd %ymm3, %ymm2, %ymm0
2632; AVX512VL-FAST-NEXT:    retq
2633entry:
2634  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
2635  %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2636  %add = add <8 x i32> %shuffle, %shuffle1
2637  ret <8 x i32> %add
2638}
2639
; Integer even/odd element add with the halves swapped: the low four elements
; of each shuffle come from %b and the high four from %a. Expected lowering
; per configuration: the AVX-only run builds both shuffles with 128-bit
; sequences and performs the add as two 128-bit vpaddd that are recombined
; with vinsertf128; the AVX2 runs and the AVX512VL run without
; fast-variable-shuffle use vshufps + vpermpd with %b (ymm1) as the first
; source and a 256-bit vpaddd; the AVX512VL run with fast-variable-shuffle
; uses two vpermi2d with the operand order swapped.
2640define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) {
2641; AVX1-LABEL: add_v8i32_8ACE0246_9BDF1357:
2642; AVX1:       # %bb.0: # %entry
2643; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2644; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm2[0,2]
2645; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
2646; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
2647; AVX1-NEXT:    vshufps {{.*#+}} xmm5 = xmm1[0,2],xmm4[0,2]
2648; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
2649; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
2650; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2651; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
2652; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2653; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2654; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
2655; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
2656; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
2657; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2658; AVX1-NEXT:    retq
2659;
2660; AVX2-LABEL: add_v8i32_8ACE0246_9BDF1357:
2661; AVX2:       # %bb.0: # %entry
2662; AVX2-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
2663; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
2664; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
2665; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
2666; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
2667; AVX2-NEXT:    retq
2668;
2669; AVX512VL-SLOW-LABEL: add_v8i32_8ACE0246_9BDF1357:
2670; AVX512VL-SLOW:       # %bb.0: # %entry
2671; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
2672; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
2673; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
2674; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
2675; AVX512VL-SLOW-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
2676; AVX512VL-SLOW-NEXT:    retq
2677;
2678; AVX512VL-FAST-LABEL: add_v8i32_8ACE0246_9BDF1357:
2679; AVX512VL-FAST:       # %bb.0: # %entry
2680; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
2681; AVX512VL-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm2
2682; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
2683; AVX512VL-FAST-NEXT:    vpermi2d %ymm0, %ymm1, %ymm3
2684; AVX512VL-FAST-NEXT:    vpaddd %ymm3, %ymm2, %ymm0
2685; AVX512VL-FAST-NEXT:    retq
2686entry:
2687  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 0, i32 2, i32 4, i32 6>
2688  %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7>
2689  %add = add <8 x i32> %shuffle, %shuffle1
2690  ret <8 x i32> %add
2691}
2692