; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST

define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0000:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_0000:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_0000:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0001:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_0001:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_0001:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0020:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_0020:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_0020:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0300:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_0300:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_0300:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_1000:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_1000:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_1000:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_2200:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_2200:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_2200:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_2222(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_2222:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_2222:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_2222:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4f64_2222_bc:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_2222_bc:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_2222_bc:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512VL-NEXT:    retq
  %tmp0 = bitcast <4 x i64> %a to <4 x double>
  %tmp1 = bitcast <4 x i64> %b to <4 x double>
  %shuffle = shufflevector <4 x double> %tmp0, <4 x double> %tmp1, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3330:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_3330:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_3330:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3210:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_3210:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_3210:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0023:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
; ALL-NEXT:    retq

  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0022:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64mem_0022(<4 x double>* %ptr, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64mem_0022:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; ALL-NEXT:    retq
  %a = load  <4 x double>,  <4 x double>* %ptr
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1032:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1133:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1023:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1022:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0213(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0213:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_0213:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_0213:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0423:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0462:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0426:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1537:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_4062:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_5173:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_5163:
; ALL:       # %bb.0:
; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0527:
; ALL:       # %bb.0:
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_4163:
; ALL:       # %bb.0:
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0145:
; ALL:       # %bb.0:
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_4501:
; ALL:       # %bb.0:
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0167:
; ALL:       # %bb.0:
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1054:
; ALL:       # %bb.0:
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) {
; AVX1OR2-LABEL: shuffle_v4f64_3254:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX1OR2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1OR2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4f64_3254:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX512VL-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4f64_3254:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [3,2,5,4]
; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
; AVX1OR2-LABEL: shuffle_v4f64_3276:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1OR2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1OR2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4f64_3276:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX512VL-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4f64_3276:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [3,2,7,6]
; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1076:
; ALL:       # %bb.0:
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0415(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0415:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_0415:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_0415:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovapd {{.*#+}} ymm2 = [0,4,1,5]
; AVX512VL-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_u062(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_u062:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 2>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_15uu:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_11uu:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_22uu(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_22uu:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_22uu:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_22uu:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 undef, i32 undef>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3333:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4f64_3333:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4f64_3333:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0z3z:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v4f64_0z3z:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
; AVX2-SLOW-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v4f64_0z3z:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4f64_0z3z:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
; AVX512VL-SLOW-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512VL-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4f64_0z3z:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-FAST-NEXT:    retq
  %shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4>
  ret <4 x double> %shuffle
}

define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_1z2z:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v4f64_1z2z:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v4f64_1z2z:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4f64_1z2z:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4f64_1z2z:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-FAST-NEXT:    retq
  %1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
  ret <4 x double> %1
}

define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0000:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0000:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_0000:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0001:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0001:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_0001:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0020:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0020:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_0020:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0112:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0112:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_0112:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
  ret <4 x i64> %shuffle
}

723define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
724; AVX1-LABEL: shuffle_v4i64_0300:
725; AVX1:       # %bb.0:
726; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
727; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
728; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
729; AVX1-NEXT:    retq
730;
731; AVX2-LABEL: shuffle_v4i64_0300:
732; AVX2:       # %bb.0:
733; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
734; AVX2-NEXT:    retq
735;
736; AVX512VL-LABEL: shuffle_v4i64_0300:
737; AVX512VL:       # %bb.0:
738; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
739; AVX512VL-NEXT:    retq
740  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
741  ret <4 x i64> %shuffle
742}
743
744define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
745; AVX1-LABEL: shuffle_v4i64_1000:
746; AVX1:       # %bb.0:
747; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
748; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
749; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
750; AVX1-NEXT:    retq
751;
752; AVX2-LABEL: shuffle_v4i64_1000:
753; AVX2:       # %bb.0:
754; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
755; AVX2-NEXT:    retq
756;
757; AVX512VL-LABEL: shuffle_v4i64_1000:
758; AVX512VL:       # %bb.0:
759; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
760; AVX512VL-NEXT:    retq
761  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
762  ret <4 x i64> %shuffle
763}
764
; Mask <2,2,0,0>: AVX1 duplicates the even elements in-lane (vmovddup) then
; swaps the 128-bit halves with vperm2f128; AVX2+ fold to one vpermpd.
define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_2200:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_2200:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_2200:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
  ret <4 x i64> %shuffle
}
784
; Mask <3,3,3,0>: AVX1 emulates with lane swap + blend + in-lane vpermilpd;
; AVX2/AVX512VL use a single vpermpd.
define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3330:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_3330:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_3330:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
  ret <4 x i64> %shuffle
}
805
; Full element reversal <3,2,1,0>: AVX1 reverses within lanes then swaps the
; lanes; AVX2+ lower to one vpermpd.
define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3210:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_3210:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_3210:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i64> %shuffle
}
825
; Mask <0,2,1,3> (lane-crossing transpose pattern): AVX1 uses lane swap +
; vpermilpd + blend; AVX2/AVX512VL fold to one vpermpd.
define <4 x i64> @shuffle_v4i64_0213(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0213:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0213:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_0213:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i64> %shuffle
}
846
; Two-input mask <0,1,2,4>: lowers to broadcast of %b's low element + blend;
; with +fast-variable-shuffle AVX512VL prefers a single two-source vpermt2q.
define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0124:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0124:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_0124:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vbroadcastsd %xmm1, %ymm1
; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_0124:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,4]
; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i64> %shuffle
}
875
; Two-input mask <0,1,4,2>: all targets use vinsertf128 of %b's low half plus
; a permute of %a and a blend; AVX2/AVX512VL differ from AVX1 only in the
; permute/blend flavor.
define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0142:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0142:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_0142:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i64> %shuffle
}
900
; Two-input mask <0,4,1,2>: AVX1 needs extract/palignr/insert plus a blend;
; AVX2 and AVX512VL-SLOW use broadcast + vpermq + blend; AVX512VL-FAST folds
; the permute and blend into vpermt2q.
define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0412:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0412:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_0412:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_0412:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2]
; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
  ret <4 x i64> %shuffle
}
934
; Two-input mask <4,0,1,2> (rotate %a right by one, insert %b[0] in front):
; AVX2/AVX512VL-SLOW use vpermpd + blend; AVX512VL-FAST a single vpermt2q.
define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_4012:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_4012:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_4012:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_4012:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,0,1,2]
; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
  ret <4 x i64> %shuffle
}
965
; Concat of the two low halves <0,1,4,5>: every subtarget recognizes this as a
; single vinsertf128.
define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_0145:
; ALL:       # %bb.0:
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %shuffle
}
974
; Two-input mask <0,4,5,1>: AVX1 composes it from xmm unpck/movlhps + insert;
; AVX2 needs two vpermpd + blend; AVX512VL uses one vpermt2q.
define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0451:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0451:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,3]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_0451:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,4,5,1]
; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
  ret <4 x i64> %shuffle
}
998
; Mask <4,5,0,1> (low halves swapped): a single vinsertf128 on all targets.
define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_4501:
; ALL:       # %bb.0:
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i64> %shuffle
}
1007
; Two-input mask <4,0,1,5> (mirror of 0451 with operands swapped): AVX1 uses
; xmm unpck/movlhps + insert, AVX2 two vpermpd + blend, AVX512VL one vpermt2q.
define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_4015:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_4015:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_4015:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,0,1,5]
; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
  ret <4 x i64> %shuffle
}
1031
; Mask <2,u,3,5> with an undef element: the undef lane gives the lowering
; freedom (AVX1 reuses an extract+unpckhpd pair; AVX512VL-FAST picks an
; arbitrary index 5 for the undef slot in its vpermt2q mask).
define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_2u35:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_2u35:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
; AVX2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_2u35:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_2u35:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,5,3,5]
; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
  ret <4 x i64> %shuffle
}
1060
; Two-input mask <1,2,5,1>: worst case for AVX1 (lane swap, vshufpd, xmm
; permute, insert, blend); AVX2 needs two vpermpd + blend; AVX512VL one
; vpermt2q.
define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1251:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_1251:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_1251:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,5,1]
; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
  ret <4 x i64> %shuffle
}
1086
; Mask <1,0,5,4>: concat the two low halves then swap within each lane; AVX2+
; use integer-domain-avoiding vpermilps for the in-lane swap.
define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1054:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_1054:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_1054:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
  ret <4 x i64> %shuffle
}
1108
; Mask <3,2,5,4>: 128-bit half selection (vperm2f128/vperm2i128) followed by
; an in-lane element swap; AVX512VL-FAST collapses both into vpermt2q.
define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3254:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_3254:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_3254:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_3254:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,5,4]
; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
  ret <4 x i64> %shuffle
}
1136
; Mask <3,2,7,6> (both high halves, elements swapped): same shape as 3254 but
; selecting the upper 128-bit halves of both inputs.
define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3276:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_3276:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_3276:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_3276:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,7,6]
; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
  ret <4 x i64> %shuffle
}
1164
; Mask <1,0,7,6>: per-lane blend of the inputs followed by an in-lane swap —
; no cross-lane permute needed on any target.
define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1076:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_1076:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_1076:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
  ret <4 x i64> %shuffle
}
1186
; Interleave-low mask <0,4,1,5>: AVX1 builds it from xmm movlhps/unpckhpd +
; insert; AVX2 uses two vpermpd + blend; AVX512VL a single vpermt2q.
define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0415:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_0415:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_0415:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,4,1,5]
; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i64> %shuffle
}
1210
; Shuffle with zero vector <z,4,z,6>: recognized as a per-lane left byte shift
; (vpslldq) on AVX2+; AVX1 uses a zero register and vunpcklpd.
define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
; AVX1-LABEL: shuffle_v4i64_z4z6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_z4z6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_z4z6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 0, i32 6>
  ret <4 x i64> %shuffle
}
1230
; Shuffle with zero vector <5,z,u,z>: recognized as a per-lane right byte
; shift (vpsrldq) on AVX2+; AVX1 uses a zero register and vunpckhpd.
define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
; AVX1-LABEL: shuffle_v4i64_5zuz:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_5zuz:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_5zuz:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 0, i32 undef, i32 0>
  ret <4 x i64> %shuffle
}
1250
; Mask <4,0,u,2>: the undef lane lets all targets match a single vunpcklpd.
define <4 x i64> @shuffle_v4i64_40u2(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_40u2:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 2>
  ret <4 x i64> %shuffle
}
1259
; Mask <1,5,u,u>: upper half undef, so the whole op shrinks to a 128-bit
; vunpckhpd.
define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_15uu:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
  ret <4 x i64> %shuffle
}
1268
; Mask <1,1,u,u>: upper half undef; shrinks to a 128-bit in-lane vpermilps.
define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_11uu:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
  ret <4 x i64> %shuffle
}
1277
; Mask <2,2,u,u>: AVX1 extracts the high half then splats in-lane; AVX2+ use
; one vpermpd (undef lanes filled with arbitrary indices).
define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_22uu:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_22uu:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_22uu:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 undef, i32 undef>
  ret <4 x i64> %shuffle
}
1297
; Splat of element 3: AVX1 duplicates in-lane then replicates the high half;
; AVX2+ use a single vpermpd.
define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3333:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_3333:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_3333:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i64> %shuffle
}
1317
; Mask <1,z,3,z> (zeros come from the constant second operand): AVX2+ match a
; per-lane right byte shift (vpsrldq); AVX1 builds the result manually.
define <4 x i64> @shuffle_v4i64_1z3z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1z3z:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_1z3z:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_1z3z:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i64> %shuffle
}
1341
; Stress test for the shuffle combiner: a chain of shuffles that must fold
; away completely — the only generated instruction checked is retq.
define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: stress_test1:
; ALL:         retq
  %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
  %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> <i32 3, i32 undef, i32 2, i32 undef>
  %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 undef>
  %f = shufflevector <4 x i64> %d, <4 x i64> %e, <4 x i32> <i32 5, i32 1, i32 1, i32 0>

  ret <4 x i64> %f
}
1352
; insertelement + zero shuffle: vmovq alone suffices (it zeroes the upper
; bits of the destination register).
define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
; ALL-LABEL: insert_reg_and_zero_v4i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovq %rdi, %xmm0
; ALL-NEXT:    retq
  %v = insertelement <4 x i64> undef, i64 %a, i64 0
  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i64> %shuffle
}
1362
; Load + insert + zero shuffle folds to a single zero-extending vmovsd load.
define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v4i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
  %a = load i64, i64* %ptr
  %v = insertelement <4 x i64> undef, i64 %a, i64 0
  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i64> %shuffle
}
1373
; FP register insert + zero shuffle: implemented as xor-zero + 128-bit blend
; (the implicit-def kill comment is part of the expected output).
define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; ALL-LABEL: insert_reg_and_zero_v4f64:
; ALL:       # %bb.0:
; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; ALL-NEXT:    retq
  %v = insertelement <4 x double> undef, double %a, i32 0
  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x double> %shuffle
}
1385
; FP load + insert + zero shuffle folds to a single zero-extending vmovsd.
define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v4f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
  %a = load double, double* %ptr
  %v = insertelement <4 x double> undef, double %a, i32 0
  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x double> %shuffle
}
1396
; Load + splat folds to a single memory-operand vbroadcastsd on all targets.
define <4 x double> @splat_mem_v4f64(double* %ptr) {
; ALL-LABEL: splat_mem_v4f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %a = load double, double* %ptr
  %v = insertelement <4 x double> undef, double %a, i32 0
  %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x double> %shuffle
}
1407
; Integer load + splat also uses the FP-domain vbroadcastsd from memory.
define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
; ALL-LABEL: splat_mem_v4i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %a = load i64, i64* %ptr
  %v = insertelement <4 x i64> undef, i64 %a, i64 0
  %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x i64> %shuffle
}
1418
; Splat of a loaded scalar widened from v2f64 to v4f64: still a single
; vbroadcastsd from memory.
define <4 x double> @splat_mem_v4f64_2(double* %p) {
; ALL-LABEL: splat_mem_v4f64_2:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %1 = load double, double* %p
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %3
}
1429
; Register splat of a v2f64's low element: AVX1 lacks register-source
; vbroadcastsd, so it uses vmovddup + vinsertf128; AVX2+ broadcast directly.
define <4 x double> @splat_v4f64(<2 x double> %r) {
; AVX1-LABEL: splat_v4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splat_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: splat_v4f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
  %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %1
}
1449
; Splat element 0 of a loaded v2i64: folds to a scalar vbroadcastsd load.
define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; ALL-LABEL: splat_mem_v4i64_from_v2i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %v = load <2 x i64>, <2 x i64>* %ptr
  %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x i64> %shuffle
}
1459
; Splat element 0 of a loaded v2f64: folds to a scalar vbroadcastsd load.
define <4 x double> @splat_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
; ALL-LABEL: splat_mem_v4f64_from_v2f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %v = load <2 x double>, <2 x double>* %ptr
  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %shuffle
}
1469
; 128-bit splat of a loaded v2i64: vbroadcastf128 on AVX1/AVX2, the integer
; vbroadcasti128 on AVX512VL.
define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; AVX1OR2-LABEL: splat128_mem_v4i64_from_v2i64:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX1OR2-NEXT:    retq
;
; AVX512VL-LABEL: splat128_mem_v4i64_from_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512VL-NEXT:    retq
  %v = load <2 x i64>, <2 x i64>* %ptr
  %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %shuffle
}
1484
; 128-bit splat of a loaded v2f64: a single vbroadcastf128 on all targets.
define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
; ALL-LABEL: splat128_mem_v4f64_from_v2f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; ALL-NEXT:    retq
  %v = load <2 x double>, <2 x double>* %ptr
  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %shuffle
}
1494
; Broadcast that must look through a widening shuffle and a bitcast: AVX2+
; still reduce it to a single register vbroadcastsd.
define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: broadcast_v4f64_0000_from_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: broadcast_v4f64_0000_from_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i64> %1 to <4 x double>
  %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %3
}
1516
; The f32 reversal and i16 swap shuffles through bitcasts cancel each other,
; so only the original 64-bit interleave (vunpcklpd) remains.
define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: bitcast_v4f64_0426:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; ALL-NEXT:    retq
  %shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
  %bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float>
  %shuffle32 = shufflevector <8 x float> %bitcast32, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  %bitcast16 = bitcast <8 x float> %shuffle32 to <16 x i16>
  %shuffle16 = shufflevector <16 x i16> %bitcast16, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
  %bitcast64 = bitcast <16 x i16> %shuffle16 to <4 x double>
  ret <4 x double> %bitcast64
}
1530
; Extract + concat of %a0's low half and %a1's high half: recognized as a
; single 256-bit blend.
define <4 x i64> @concat_v4i64_0167(<4 x i64> %a0, <4 x i64> %a1) {
; ALL-LABEL: concat_v4i64_0167:
; ALL:       # %bb.0:
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT:    retq
  %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
  %a1hi = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 6, i32 7>
  %shuffle64 = shufflevector <2 x i64> %a0lo, <2 x i64> %a1hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %shuffle64
}
1541
; Concat of the two low halves expressed through i32 bitcasts: the combiner
; sees through the casts and emits one vinsertf128.
define <4 x i64> @concat_v4i64_0145_bc(<4 x i64> %a0, <4 x i64> %a1) {
; ALL-LABEL: concat_v4i64_0145_bc:
; ALL:       # %bb.0:
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    retq
  %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
  %a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 4, i32 5>
  %bc0lo = bitcast <2 x i64> %a0lo to <4 x i32>
  %bc1lo = bitcast <2 x i64> %a1lo to <4 x i32>
  %shuffle32 = shufflevector <4 x i32> %bc0lo, <4 x i32> %bc1lo, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %shuffle64 = bitcast <8 x i32> %shuffle32 to <4 x i64>
  ret <4 x i64> %shuffle64
}
1555
; Splat of an i64 loaded from memory (deliberately align 1); the
; insertelement + zero-mask shufflevector should fold into a single
; vbroadcastsd from memory on all targets.
1556define <4 x i64> @insert_dup_mem_v4i64(i64* %ptr) {
1557; ALL-LABEL: insert_dup_mem_v4i64:
1558; ALL:       # %bb.0:
1559; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
1560; ALL-NEXT:    retq
1561  %tmp = load i64, i64* %ptr, align 1
1562  %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
1563  %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <4 x i32> zeroinitializer
1564  ret <4 x i64> %tmp2
1565}
1566
; Two-input rotate-by-one: elements 1..3 of %a followed by element 0 of %b.
; AVX512VL can express this as a single valignq; AVX1/AVX2 need a blend of
; the inputs followed by a cross-lane permute.
1567define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
1568; AVX1-LABEL: shuffle_v4i64_1234:
1569; AVX1:       # %bb.0:
1570; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
1571; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
1572; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
1573; AVX1-NEXT:    retq
1574;
1575; AVX2-LABEL: shuffle_v4i64_1234:
1576; AVX2:       # %bb.0:
1577; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
1578; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
1579; AVX2-NEXT:    retq
1580;
1581; AVX512VL-LABEL: shuffle_v4i64_1234:
1582; AVX512VL:       # %bb.0:
1583; AVX512VL-NEXT:    valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
1584; AVX512VL-NEXT:    retq
1585  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
1586  ret <4 x i64> %shuffle
1587}
1588
; Single-input rotate-by-one <1,2,3,0>. AVX2/AVX512VL lower it to one
; cross-lane vpermpd; AVX1 has no 4x64-bit cross-lane permute and must
; combine vperm2f128 with vshufpd.
1589define <4 x i64> @shuffle_v4i64_1230(<4 x i64> %a) {
1590; AVX1-LABEL: shuffle_v4i64_1230:
1591; AVX1:       # %bb.0:
1592; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
1593; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
1594; AVX1-NEXT:    retq
1595;
1596; AVX2-LABEL: shuffle_v4i64_1230:
1597; AVX2:       # %bb.0:
1598; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
1599; AVX2-NEXT:    retq
1600;
1601; AVX512VL-LABEL: shuffle_v4i64_1230:
1602; AVX512VL:       # %bb.0:
1603; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
1604; AVX512VL-NEXT:    retq
1605  %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
1606  ret <4 x i64> %shuffle
1607}
1608
; Shuffle mixing %a with a zero element: result is <0, a0, 0, a3>. The
; -SLOW variants use a permute plus a blend against a zeroed register;
; with +fast-variable-shuffle a single vpshufb with zeroing byte-mask is
; preferred instead.
1609define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {
1610; AVX1-LABEL: shuffle_v4i64_z0z3:
1611; AVX1:       # %bb.0:
1612; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
1613; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1614; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
1615; AVX1-NEXT:    retq
1616;
1617; AVX2-SLOW-LABEL: shuffle_v4i64_z0z3:
1618; AVX2-SLOW:       # %bb.0:
1619; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
1620; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1621; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1622; AVX2-SLOW-NEXT:    retq
1623;
1624; AVX2-FAST-LABEL: shuffle_v4i64_z0z3:
1625; AVX2-FAST:       # %bb.0:
1626; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
1627; AVX2-FAST-NEXT:    retq
1628;
1629; AVX512VL-SLOW-LABEL: shuffle_v4i64_z0z3:
1630; AVX512VL-SLOW:       # %bb.0:
1631; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
1632; AVX512VL-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1633; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
1634; AVX512VL-SLOW-NEXT:    retq
1635;
1636; AVX512VL-FAST-LABEL: shuffle_v4i64_z0z3:
1637; AVX512VL-FAST:       # %bb.0:
1638; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
1639; AVX512VL-FAST-NEXT:    retq
1640  %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 3>
1641  ret <4 x i64> %1
1642}
1643
; Shuffle mixing %a with a zero element: result is <a1, 0, a2, 0>.
; +fast-variable-shuffle targets collapse it to one zeroing vpshufb; the
; -SLOW variants blend with a zero register and then permute; AVX1 needs a
; longer extract/unpack/insert sequence because it lacks cross-lane
; 64-bit permutes.
1644define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
1645; AVX1-LABEL: shuffle_v4i64_1z2z:
1646; AVX1:       # %bb.0:
1647; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1648; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
1649; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1650; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1651; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1652; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1653; AVX1-NEXT:    retq
1654;
1655; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z:
1656; AVX2-SLOW:       # %bb.0:
1657; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1658; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
1659; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
1660; AVX2-SLOW-NEXT:    retq
1661;
1662; AVX2-FAST-LABEL: shuffle_v4i64_1z2z:
1663; AVX2-FAST:       # %bb.0:
1664; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
1665; AVX2-FAST-NEXT:    retq
1666;
1667; AVX512VL-SLOW-LABEL: shuffle_v4i64_1z2z:
1668; AVX512VL-SLOW:       # %bb.0:
1669; AVX512VL-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1670; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
1671; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
1672; AVX512VL-SLOW-NEXT:    retq
1673;
1674; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z:
1675; AVX512VL-FAST:       # %bb.0:
1676; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
1677; AVX512VL-FAST-NEXT:    retq
1678  %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
1679  ret <4 x i64> %1
1680}
1681
; De-interleave concat(%a,%b) into even elements <0,2,4,6> and odd
; elements <1,3,5,7>, then add them (horizontal-add-style pattern).
; AVX2/AVX512VL-SLOW use unpck + vpermpd per operand; AVX512VL-FAST uses
; two vpermi2pd with constant index vectors; AVX1 must assemble each
; operand from 128-bit halves.
1682define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) {
1683; AVX1-LABEL: add_v4f64_0246_1357:
1684; AVX1:       # %bb.0: # %entry
1685; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1686; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
1687; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
1688; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
1689; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm0[0],xmm4[0]
1690; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
1691; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
1692; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
1693; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
1694; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
1695; AVX1-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
1696; AVX1-NEXT:    retq
1697;
1698; AVX2-LABEL: add_v4f64_0246_1357:
1699; AVX2:       # %bb.0: # %entry
1700; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1701; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
1702; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1703; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1704; AVX2-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
1705; AVX2-NEXT:    retq
1706;
1707; AVX512VL-SLOW-LABEL: add_v4f64_0246_1357:
1708; AVX512VL-SLOW:       # %bb.0: # %entry
1709; AVX512VL-SLOW-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1710; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
1711; AVX512VL-SLOW-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1712; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1713; AVX512VL-SLOW-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
1714; AVX512VL-SLOW-NEXT:    retq
1715;
1716; AVX512VL-FAST-LABEL: add_v4f64_0246_1357:
1717; AVX512VL-FAST:       # %bb.0: # %entry
1718; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [0,2,4,6]
1719; AVX512VL-FAST-NEXT:    vpermi2pd %ymm1, %ymm0, %ymm2
1720; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [1,3,5,7]
1721; AVX512VL-FAST-NEXT:    vpermi2pd %ymm1, %ymm0, %ymm3
1722; AVX512VL-FAST-NEXT:    vaddpd %ymm3, %ymm2, %ymm0
1723; AVX512VL-FAST-NEXT:    retq
1724entry:
1725  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1726  %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1727  %add = fadd <4 x double> %shuffle, %shuffle1
1728  ret <4 x double> %add
1729}
1730
; Same even/odd de-interleave-and-add as add_v4f64_0246_1357 but with the
; operand order swapped in the masks (%b's elements first), checking that
; the commuted form lowers to the same shuffle sequences with the inputs
; exchanged.
1731define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) {
1732; AVX1-LABEL: add_v4f64_4602_5713:
1733; AVX1:       # %bb.0: # %entry
1734; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1735; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0]
1736; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
1737; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
1738; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm5 = xmm1[0],xmm4[0]
1739; AVX1-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
1740; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1741; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1742; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
1743; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
1744; AVX1-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
1745; AVX1-NEXT:    retq
1746;
1747; AVX2-LABEL: add_v4f64_4602_5713:
1748; AVX2:       # %bb.0: # %entry
1749; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
1750; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
1751; AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
1752; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1753; AVX2-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
1754; AVX2-NEXT:    retq
1755;
1756; AVX512VL-SLOW-LABEL: add_v4f64_4602_5713:
1757; AVX512VL-SLOW:       # %bb.0: # %entry
1758; AVX512VL-SLOW-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
1759; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
1760; AVX512VL-SLOW-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
1761; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
1762; AVX512VL-SLOW-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
1763; AVX512VL-SLOW-NEXT:    retq
1764;
1765; AVX512VL-FAST-LABEL: add_v4f64_4602_5713:
1766; AVX512VL-FAST:       # %bb.0: # %entry
1767; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [0,2,4,6]
1768; AVX512VL-FAST-NEXT:    vpermi2pd %ymm0, %ymm1, %ymm2
1769; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [1,3,5,7]
1770; AVX512VL-FAST-NEXT:    vpermi2pd %ymm0, %ymm1, %ymm3
1771; AVX512VL-FAST-NEXT:    vaddpd %ymm3, %ymm2, %ymm0
1772; AVX512VL-FAST-NEXT:    retq
1773entry:
1774  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
1775  %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
1776  %add = fadd <4 x double> %shuffle, %shuffle1
1777  ret <4 x double> %add
1778}
1779
; Integer version of add_v4f64_0246_1357: even/odd de-interleave of
; concat(%a,%b) followed by a 64-bit integer add. Note AVX1 performs the
; final vpaddq in two 128-bit halves (extract/add/insert), since the
; checked sequence has no 256-bit integer add.
1780define <4 x i64> @add_v4i64_0246_1357(<4 x i64> %a, <4 x i64> %b) {
1781; AVX1-LABEL: add_v4i64_0246_1357:
1782; AVX1:       # %bb.0: # %entry
1783; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1784; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
1785; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
1786; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
1787; AVX1-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm4[0]
1788; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
1789; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
1790; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
1791; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
1792; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1793; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1794; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
1795; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
1796; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
1797; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1798; AVX1-NEXT:    retq
1799;
1800; AVX2-LABEL: add_v4i64_0246_1357:
1801; AVX2:       # %bb.0: # %entry
1802; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1803; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1804; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1805; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1806; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
1807; AVX2-NEXT:    retq
1808;
1809; AVX512VL-SLOW-LABEL: add_v4i64_0246_1357:
1810; AVX512VL-SLOW:       # %bb.0: # %entry
1811; AVX512VL-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
1812; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1813; AVX512VL-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1814; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1815; AVX512VL-SLOW-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
1816; AVX512VL-SLOW-NEXT:    retq
1817;
1818; AVX512VL-FAST-LABEL: add_v4i64_0246_1357:
1819; AVX512VL-FAST:       # %bb.0: # %entry
1820; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
1821; AVX512VL-FAST-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
1822; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
1823; AVX512VL-FAST-NEXT:    vpermi2q %ymm1, %ymm0, %ymm3
1824; AVX512VL-FAST-NEXT:    vpaddq %ymm3, %ymm2, %ymm0
1825; AVX512VL-FAST-NEXT:    retq
1826entry:
1827  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
1828  %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
1829  %add = add <4 x i64> %shuffle, %shuffle1
1830  ret <4 x i64> %add
1831}
1832
; Commuted integer version: same de-interleave-and-add as
; add_v4i64_0246_1357 but with %b's elements first in the masks; the
; lowering mirrors that test with the inputs exchanged (AVX1 again splits
; the vpaddq into 128-bit halves).
1833define <4 x i64> @add_v4i64_4602_5713(<4 x i64> %a, <4 x i64> %b) {
1834; AVX1-LABEL: add_v4i64_4602_5713:
1835; AVX1:       # %bb.0: # %entry
1836; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1837; AVX1-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0]
1838; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
1839; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
1840; AVX1-NEXT:    vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm4[0]
1841; AVX1-NEXT:    vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
1842; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1843; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1844; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
1845; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1846; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1847; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
1848; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
1849; AVX1-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
1850; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1851; AVX1-NEXT:    retq
1852;
1853; AVX2-LABEL: add_v4i64_4602_5713:
1854; AVX2:       # %bb.0: # %entry
1855; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
1856; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1857; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
1858; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1859; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
1860; AVX2-NEXT:    retq
1861;
1862; AVX512VL-SLOW-LABEL: add_v4i64_4602_5713:
1863; AVX512VL-SLOW:       # %bb.0: # %entry
1864; AVX512VL-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
1865; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
1866; AVX512VL-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
1867; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
1868; AVX512VL-SLOW-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
1869; AVX512VL-SLOW-NEXT:    retq
1870;
1871; AVX512VL-FAST-LABEL: add_v4i64_4602_5713:
1872; AVX512VL-FAST:       # %bb.0: # %entry
1873; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
1874; AVX512VL-FAST-NEXT:    vpermi2q %ymm0, %ymm1, %ymm2
1875; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
1876; AVX512VL-FAST-NEXT:    vpermi2q %ymm0, %ymm1, %ymm3
1877; AVX512VL-FAST-NEXT:    vpaddq %ymm3, %ymm2, %ymm0
1878; AVX512VL-FAST-NEXT:    retq
1879entry:
1880  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
1881  %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
1882  %add = add <4 x i64> %shuffle, %shuffle1
1883  ret <4 x i64> %add
1884}
1885
; "Keep element 0, zero the rest" under optsize: expected to lower to the
; compact vxorpd + vmovsd pair (scalar move with implicit upper zeroing)
; on all targets.
1886define <4 x double> @shuffle_v4f64_0zzz_optsize(<4 x double> %a) optsize {
1887; ALL-LABEL: shuffle_v4f64_0zzz_optsize:
1888; ALL:       # %bb.0:
1889; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1890; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1891; ALL-NEXT:    retq
1892  %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1893  ret <4 x double> %b
1894}
1895
; Integer twin of shuffle_v4f64_0zzz_optsize: under optsize the v4i64
; "keep element 0, zero the rest" also uses the same vxorpd + vmovsd
; lowering on all targets.
1896define <4 x i64> @shuffle_v4i64_0zzz_optsize(<4 x i64> %a) optsize {
1897; ALL-LABEL: shuffle_v4i64_0zzz_optsize:
1898; ALL:       # %bb.0:
1899; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1900; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1901; ALL-NEXT:    retq
1902  %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1903  ret <4 x i64> %b
1904}
1905
; 32-bit-element variant: under optsize, keeping only float element 0 and
; zeroing the other seven lowers to vxorps + vmovss on all targets.
1906define <8 x float> @shuffle_v8f32_0zzzzzzz_optsize(<8 x float> %a) optsize {
1907; ALL-LABEL: shuffle_v8f32_0zzzzzzz_optsize:
1908; ALL:       # %bb.0:
1909; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1910; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1911; ALL-NEXT:    retq
1912  %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1913  ret <8 x float> %b
1914}
1915
; Integer twin of shuffle_v8f32_0zzzzzzz_optsize: the v8i32 zero-all-but-
; element-0 shuffle shares the same vxorps + vmovss optsize lowering.
1916define <8 x i32> @shuffle_v8i32_0zzzzzzz_optsize(<8 x i32> %a) optsize {
1917; ALL-LABEL: shuffle_v8i32_0zzzzzzz_optsize:
1918; ALL:       # %bb.0:
1919; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1920; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1921; ALL-NEXT:    retq
1922  %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1923  ret <8 x i32> %b
1924}
1925