• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-SLOW
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2 --check-prefix=AVX2-FAST
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
7
8define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
9; AVX1-LABEL: shuffle_v4f64_0000:
10; AVX1:       # %bb.0:
11; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
12; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
13; AVX1-NEXT:    retq
14;
15; AVX2-LABEL: shuffle_v4f64_0000:
16; AVX2:       # %bb.0:
17; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
18; AVX2-NEXT:    retq
19;
20; AVX512VL-LABEL: shuffle_v4f64_0000:
21; AVX512VL:       # %bb.0:
22; AVX512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
23; AVX512VL-NEXT:    retq
24  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
25  ret <4 x double> %shuffle
26}
27
28define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
29; AVX1-LABEL: shuffle_v4f64_0001:
30; AVX1:       # %bb.0:
31; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
32; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
33; AVX1-NEXT:    retq
34;
35; AVX2-LABEL: shuffle_v4f64_0001:
36; AVX2:       # %bb.0:
37; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
38; AVX2-NEXT:    retq
39;
40; AVX512VL-LABEL: shuffle_v4f64_0001:
41; AVX512VL:       # %bb.0:
42; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
43; AVX512VL-NEXT:    retq
44  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
45  ret <4 x double> %shuffle
46}
47
48define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
49; AVX1-LABEL: shuffle_v4f64_0020:
50; AVX1:       # %bb.0:
51; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
52; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
53; AVX1-NEXT:    retq
54;
55; AVX2-LABEL: shuffle_v4f64_0020:
56; AVX2:       # %bb.0:
57; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
58; AVX2-NEXT:    retq
59;
60; AVX512VL-LABEL: shuffle_v4f64_0020:
61; AVX512VL:       # %bb.0:
62; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
63; AVX512VL-NEXT:    retq
64  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
65  ret <4 x double> %shuffle
66}
67
68define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
69; AVX1-LABEL: shuffle_v4f64_0300:
70; AVX1:       # %bb.0:
71; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
72; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
73; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
74; AVX1-NEXT:    retq
75;
76; AVX2-LABEL: shuffle_v4f64_0300:
77; AVX2:       # %bb.0:
78; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
79; AVX2-NEXT:    retq
80;
81; AVX512VL-LABEL: shuffle_v4f64_0300:
82; AVX512VL:       # %bb.0:
83; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
84; AVX512VL-NEXT:    retq
85  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
86  ret <4 x double> %shuffle
87}
88
89define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
90; AVX1-LABEL: shuffle_v4f64_1000:
91; AVX1:       # %bb.0:
92; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
93; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
94; AVX1-NEXT:    retq
95;
96; AVX2-LABEL: shuffle_v4f64_1000:
97; AVX2:       # %bb.0:
98; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
99; AVX2-NEXT:    retq
100;
101; AVX512VL-LABEL: shuffle_v4f64_1000:
102; AVX512VL:       # %bb.0:
103; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
104; AVX512VL-NEXT:    retq
105  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
106  ret <4 x double> %shuffle
107}
108
109define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
110; AVX1-LABEL: shuffle_v4f64_2200:
111; AVX1:       # %bb.0:
112; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
113; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
114; AVX1-NEXT:    retq
115;
116; AVX2-LABEL: shuffle_v4f64_2200:
117; AVX2:       # %bb.0:
118; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
119; AVX2-NEXT:    retq
120;
121; AVX512VL-LABEL: shuffle_v4f64_2200:
122; AVX512VL:       # %bb.0:
123; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
124; AVX512VL-NEXT:    retq
125  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
126  ret <4 x double> %shuffle
127}
128
129define <4 x double> @shuffle_v4f64_2222(<4 x double> %a, <4 x double> %b) {
130; AVX1-LABEL: shuffle_v4f64_2222:
131; AVX1:       # %bb.0:
132; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
133; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
134; AVX1-NEXT:    retq
135;
136; AVX2-LABEL: shuffle_v4f64_2222:
137; AVX2:       # %bb.0:
138; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
139; AVX2-NEXT:    retq
140;
141; AVX512VL-LABEL: shuffle_v4f64_2222:
142; AVX512VL:       # %bb.0:
143; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
144; AVX512VL-NEXT:    retq
145  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
146  ret <4 x double> %shuffle
147}
148
149define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) {
150; AVX1-LABEL: shuffle_v4f64_2222_bc:
151; AVX1:       # %bb.0:
152; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
153; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
154; AVX1-NEXT:    retq
155;
156; AVX2-LABEL: shuffle_v4f64_2222_bc:
157; AVX2:       # %bb.0:
158; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
159; AVX2-NEXT:    retq
160;
161; AVX512VL-LABEL: shuffle_v4f64_2222_bc:
162; AVX512VL:       # %bb.0:
163; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
164; AVX512VL-NEXT:    retq
165  %tmp0 = bitcast <4 x i64> %a to <4 x double>
166  %tmp1 = bitcast <4 x i64> %b to <4 x double>
167  %shuffle = shufflevector <4 x double> %tmp0, <4 x double> %tmp1, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
168  ret <4 x double> %shuffle
169}
170
171define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) {
172; AVX1-LABEL: shuffle_v4f64_2233:
173; AVX1:       # %bb.0:
174; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
175; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
176; AVX1-NEXT:    retq
177;
178; AVX2-LABEL: shuffle_v4f64_2233:
179; AVX2:       # %bb.0:
180; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
181; AVX2-NEXT:    retq
182;
183; AVX512VL-LABEL: shuffle_v4f64_2233:
184; AVX512VL:       # %bb.0:
185; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
186; AVX512VL-NEXT:    retq
187  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
188  ret <4 x double> %shuffle
189}
190
191define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
192; AVX1-LABEL: shuffle_v4f64_3330:
193; AVX1:       # %bb.0:
194; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
195; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
196; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[2]
197; AVX1-NEXT:    retq
198;
199; AVX2-LABEL: shuffle_v4f64_3330:
200; AVX2:       # %bb.0:
201; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
202; AVX2-NEXT:    retq
203;
204; AVX512VL-LABEL: shuffle_v4f64_3330:
205; AVX512VL:       # %bb.0:
206; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
207; AVX512VL-NEXT:    retq
208  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
209  ret <4 x double> %shuffle
210}
211
212define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
213; AVX1-LABEL: shuffle_v4f64_3210:
214; AVX1:       # %bb.0:
215; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
216; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
217; AVX1-NEXT:    retq
218;
219; AVX2-LABEL: shuffle_v4f64_3210:
220; AVX2:       # %bb.0:
221; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
222; AVX2-NEXT:    retq
223;
224; AVX512VL-LABEL: shuffle_v4f64_3210:
225; AVX512VL:       # %bb.0:
226; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
227; AVX512VL-NEXT:    retq
228  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
229  ret <4 x double> %shuffle
230}
231
232define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
233; ALL-LABEL: shuffle_v4f64_0023:
234; ALL:       # %bb.0:
235; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
236; ALL-NEXT:    retq
237
238  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
239  ret <4 x double> %shuffle
240}
241
242define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
243; ALL-LABEL: shuffle_v4f64_0022:
244; ALL:       # %bb.0:
245; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
246; ALL-NEXT:    retq
247  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
248  ret <4 x double> %shuffle
249}
250
251define <4 x double> @shuffle_v4f64mem_0022(<4 x double>* %ptr, <4 x double> %b) {
252; ALL-LABEL: shuffle_v4f64mem_0022:
253; ALL:       # %bb.0:
254; ALL-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
255; ALL-NEXT:    retq
256  %a = load  <4 x double>,  <4 x double>* %ptr
257  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
258  ret <4 x double> %shuffle
259}
260
261define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
262; ALL-LABEL: shuffle_v4f64_1032:
263; ALL:       # %bb.0:
264; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
265; ALL-NEXT:    retq
266  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
267  ret <4 x double> %shuffle
268}
269
270define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
271; ALL-LABEL: shuffle_v4f64_1133:
272; ALL:       # %bb.0:
273; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
274; ALL-NEXT:    retq
275  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
276  ret <4 x double> %shuffle
277}
278
279define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
280; ALL-LABEL: shuffle_v4f64_1023:
281; ALL:       # %bb.0:
282; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
283; ALL-NEXT:    retq
284  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
285  ret <4 x double> %shuffle
286}
287
288define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
289; ALL-LABEL: shuffle_v4f64_1022:
290; ALL:       # %bb.0:
291; ALL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
292; ALL-NEXT:    retq
293  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
294  ret <4 x double> %shuffle
295}
296
297define <4 x double> @shuffle_v4f64_0213(<4 x double> %a, <4 x double> %b) {
298; AVX1-LABEL: shuffle_v4f64_0213:
299; AVX1:       # %bb.0:
300; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3]
301; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
302; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
303; AVX1-NEXT:    retq
304;
305; AVX2-LABEL: shuffle_v4f64_0213:
306; AVX2:       # %bb.0:
307; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
308; AVX2-NEXT:    retq
309;
310; AVX512VL-LABEL: shuffle_v4f64_0213:
311; AVX512VL:       # %bb.0:
312; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
313; AVX512VL-NEXT:    retq
314  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
315  ret <4 x double> %shuffle
316}
317
318define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
319; AVX1OR2-LABEL: shuffle_v4f64_0423:
320; AVX1OR2:       # %bb.0:
321; AVX1OR2-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
322; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
323; AVX1OR2-NEXT:    retq
324;
325; AVX512VL-SLOW-LABEL: shuffle_v4f64_0423:
326; AVX512VL-SLOW:       # %bb.0:
327; AVX512VL-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
328; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
329; AVX512VL-SLOW-NEXT:    retq
330;
331; AVX512VL-FAST-LABEL: shuffle_v4f64_0423:
332; AVX512VL-FAST:       # %bb.0:
333; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [0,4,2,3]
334; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
335; AVX512VL-FAST-NEXT:    retq
336  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
337  ret <4 x double> %shuffle
338}
339
340define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
341; AVX1OR2-LABEL: shuffle_v4f64_0462:
342; AVX1OR2:       # %bb.0:
343; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
344; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
345; AVX1OR2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
346; AVX1OR2-NEXT:    retq
347;
348; AVX512VL-LABEL: shuffle_v4f64_0462:
349; AVX512VL:       # %bb.0:
350; AVX512VL-NEXT:    vmovapd {{.*#+}} ymm2 = [0,4,6,2]
351; AVX512VL-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
352; AVX512VL-NEXT:    retq
353  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2>
354  ret <4 x double> %shuffle
355}
356
357define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
358; ALL-LABEL: shuffle_v4f64_0426:
359; ALL:       # %bb.0:
360; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
361; ALL-NEXT:    retq
362  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
363  ret <4 x double> %shuffle
364}
365
366define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) {
367; ALL-LABEL: shuffle_v4f64_1537:
368; ALL:       # %bb.0:
369; ALL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
370; ALL-NEXT:    retq
371  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
372  ret <4 x double> %shuffle
373}
374
375define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) {
376; ALL-LABEL: shuffle_v4f64_4062:
377; ALL:       # %bb.0:
378; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
379; ALL-NEXT:    retq
380  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
381  ret <4 x double> %shuffle
382}
383
384define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) {
385; ALL-LABEL: shuffle_v4f64_5173:
386; ALL:       # %bb.0:
387; ALL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
388; ALL-NEXT:    retq
389  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
390  ret <4 x double> %shuffle
391}
392
393define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
394; ALL-LABEL: shuffle_v4f64_5163:
395; ALL:       # %bb.0:
396; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
397; ALL-NEXT:    retq
398  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
399  ret <4 x double> %shuffle
400}
401
402define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) {
403; ALL-LABEL: shuffle_v4f64_0527:
404; ALL:       # %bb.0:
405; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
406; ALL-NEXT:    retq
407  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
408  ret <4 x double> %shuffle
409}
410
411define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
412; ALL-LABEL: shuffle_v4f64_4163:
413; ALL:       # %bb.0:
414; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
415; ALL-NEXT:    retq
416  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
417  ret <4 x double> %shuffle
418}
419
420define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
421; ALL-LABEL: shuffle_v4f64_0145:
422; ALL:       # %bb.0:
423; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
424; ALL-NEXT:    retq
425  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
426  ret <4 x double> %shuffle
427}
428
429define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
430; ALL-LABEL: shuffle_v4f64_4501:
431; ALL:       # %bb.0:
432; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
433; ALL-NEXT:    retq
434  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
435  ret <4 x double> %shuffle
436}
437
438define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
439; ALL-LABEL: shuffle_v4f64_0167:
440; ALL:       # %bb.0:
441; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
442; ALL-NEXT:    retq
443  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
444  ret <4 x double> %shuffle
445}
446
447define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
448; AVX1OR2-LABEL: shuffle_v4f64_1054:
449; AVX1OR2:       # %bb.0:
450; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
451; AVX1OR2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
452; AVX1OR2-NEXT:    retq
453;
454; AVX512VL-SLOW-LABEL: shuffle_v4f64_1054:
455; AVX512VL-SLOW:       # %bb.0:
456; AVX512VL-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
457; AVX512VL-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
458; AVX512VL-SLOW-NEXT:    retq
459;
460; AVX512VL-FAST-LABEL: shuffle_v4f64_1054:
461; AVX512VL-FAST:       # %bb.0:
462; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [1,0,5,4]
463; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
464; AVX512VL-FAST-NEXT:    retq
465  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
466  ret <4 x double> %shuffle
467}
468
469define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) {
470; AVX1OR2-LABEL: shuffle_v4f64_3254:
471; AVX1OR2:       # %bb.0:
472; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
473; AVX1OR2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
474; AVX1OR2-NEXT:    retq
475;
476; AVX512VL-SLOW-LABEL: shuffle_v4f64_3254:
477; AVX512VL-SLOW:       # %bb.0:
478; AVX512VL-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
479; AVX512VL-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
480; AVX512VL-SLOW-NEXT:    retq
481;
482; AVX512VL-FAST-LABEL: shuffle_v4f64_3254:
483; AVX512VL-FAST:       # %bb.0:
484; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [3,2,5,4]
485; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
486; AVX512VL-FAST-NEXT:    retq
487  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
488  ret <4 x double> %shuffle
489}
490
491define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
492; AVX1OR2-LABEL: shuffle_v4f64_3276:
493; AVX1OR2:       # %bb.0:
494; AVX1OR2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
495; AVX1OR2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
496; AVX1OR2-NEXT:    retq
497;
498; AVX512VL-SLOW-LABEL: shuffle_v4f64_3276:
499; AVX512VL-SLOW:       # %bb.0:
500; AVX512VL-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
501; AVX512VL-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
502; AVX512VL-SLOW-NEXT:    retq
503;
504; AVX512VL-FAST-LABEL: shuffle_v4f64_3276:
505; AVX512VL-FAST:       # %bb.0:
506; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [3,2,7,6]
507; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
508; AVX512VL-FAST-NEXT:    retq
509  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
510  ret <4 x double> %shuffle
511}
512
513define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
514; AVX1OR2-LABEL: shuffle_v4f64_1076:
515; AVX1OR2:       # %bb.0:
516; AVX1OR2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
517; AVX1OR2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
518; AVX1OR2-NEXT:    retq
519;
520; AVX512VL-SLOW-LABEL: shuffle_v4f64_1076:
521; AVX512VL-SLOW:       # %bb.0:
522; AVX512VL-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
523; AVX512VL-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
524; AVX512VL-SLOW-NEXT:    retq
525;
526; AVX512VL-FAST-LABEL: shuffle_v4f64_1076:
527; AVX512VL-FAST:       # %bb.0:
528; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [1,0,7,6]
529; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
530; AVX512VL-FAST-NEXT:    retq
531  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
532  ret <4 x double> %shuffle
533}
534
535define <4 x double> @shuffle_v4f64_0415(<4 x double> %a, <4 x double> %b) {
536; AVX1-LABEL: shuffle_v4f64_0415:
537; AVX1:       # %bb.0:
538; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
539; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
540; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
541; AVX1-NEXT:    retq
542;
543; AVX2-LABEL: shuffle_v4f64_0415:
544; AVX2:       # %bb.0:
545; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
546; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
547; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
548; AVX2-NEXT:    retq
549;
550; AVX512VL-LABEL: shuffle_v4f64_0415:
551; AVX512VL:       # %bb.0:
552; AVX512VL-NEXT:    vmovapd {{.*#+}} ymm2 = [0,4,1,5]
553; AVX512VL-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
554; AVX512VL-NEXT:    retq
555  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
556  ret <4 x double> %shuffle
557}
558
559define <4 x double> @shuffle_v4f64_u062(<4 x double> %a, <4 x double> %b) {
560; ALL-LABEL: shuffle_v4f64_u062:
561; ALL:       # %bb.0:
562; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
563; ALL-NEXT:    retq
564  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 2>
565  ret <4 x double> %shuffle
566}
567
568define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
569; ALL-LABEL: shuffle_v4f64_15uu:
570; ALL:       # %bb.0:
571; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
572; ALL-NEXT:    retq
573  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
574  ret <4 x double> %shuffle
575}
576
577define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
578; ALL-LABEL: shuffle_v4f64_11uu:
579; ALL:       # %bb.0:
580; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
581; ALL-NEXT:    retq
582  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
583  ret <4 x double> %shuffle
584}
585
586define <4 x double> @shuffle_v4f64_22uu(<4 x double> %a, <4 x double> %b) {
587; AVX1-LABEL: shuffle_v4f64_22uu:
588; AVX1:       # %bb.0:
589; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
590; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
591; AVX1-NEXT:    retq
592;
593; AVX2-LABEL: shuffle_v4f64_22uu:
594; AVX2:       # %bb.0:
595; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
596; AVX2-NEXT:    retq
597;
598; AVX512VL-LABEL: shuffle_v4f64_22uu:
599; AVX512VL:       # %bb.0:
600; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
601; AVX512VL-NEXT:    retq
602  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 undef, i32 undef>
603  ret <4 x double> %shuffle
604}
605
606define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
607; AVX1-LABEL: shuffle_v4f64_3333:
608; AVX1:       # %bb.0:
609; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
610; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
611; AVX1-NEXT:    retq
612;
613; AVX2-LABEL: shuffle_v4f64_3333:
614; AVX2:       # %bb.0:
615; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
616; AVX2-NEXT:    retq
617;
618; AVX512VL-LABEL: shuffle_v4f64_3333:
619; AVX512VL:       # %bb.0:
620; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
621; AVX512VL-NEXT:    retq
622  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
623  ret <4 x double> %shuffle
624}
625
626define <4 x double> @shuffle_v4f64_0456(<4 x double> %a, <4 x double> %b) {
627; AVX1OR2-LABEL: shuffle_v4f64_0456:
628; AVX1OR2:       # %bb.0:
629; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
630; AVX1OR2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
631; AVX1OR2-NEXT:    retq
632;
633; AVX512VL-SLOW-LABEL: shuffle_v4f64_0456:
634; AVX512VL-SLOW:       # %bb.0:
635; AVX512VL-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
636; AVX512VL-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
637; AVX512VL-SLOW-NEXT:    retq
638;
639; AVX512VL-FAST-LABEL: shuffle_v4f64_0456:
640; AVX512VL-FAST:       # %bb.0:
641; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [4,0,1,2]
642; AVX512VL-FAST-NEXT:    vpermi2pd %ymm0, %ymm1, %ymm2
643; AVX512VL-FAST-NEXT:    vmovapd %ymm2, %ymm0
644; AVX512VL-FAST-NEXT:    retq
645  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
646  ret <4 x double> %shuffle
647}
648
649define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
650; ALL-LABEL: shuffle_v4f64_0z3z:
651; ALL:       # %bb.0:
652; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
653; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
654; ALL-NEXT:    retq
655  %shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4>
656  ret <4 x double> %shuffle
657}
658
659define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
660; ALL-LABEL: shuffle_v4f64_1z2z:
661; ALL:       # %bb.0:
662; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
663; ALL-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2]
664; ALL-NEXT:    retq
665  %1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
666  ret <4 x double> %1
667}
668
669define <4 x double> @shuffle_v4f64_0044(<4 x double> %a, <4 x double> %b) {
670; AVX1-LABEL: shuffle_v4f64_0044:
671; AVX1:       # %bb.0:
672; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
673; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
674; AVX1-NEXT:    retq
675;
676; AVX2-LABEL: shuffle_v4f64_0044:
677; AVX2:       # %bb.0:
678; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
679; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
680; AVX2-NEXT:    retq
681;
682; AVX512VL-SLOW-LABEL: shuffle_v4f64_0044:
683; AVX512VL-SLOW:       # %bb.0:
684; AVX512VL-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
685; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
686; AVX512VL-SLOW-NEXT:    retq
687;
688; AVX512VL-FAST-LABEL: shuffle_v4f64_0044:
689; AVX512VL-FAST:       # %bb.0:
690; AVX512VL-FAST-NEXT:    vmovapd {{.*#+}} ymm2 = [0,0,4,4]
691; AVX512VL-FAST-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
692; AVX512VL-FAST-NEXT:    retq
693  %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
694  ret <4 x double> %1
695}
696
697define <4 x double> @shuffle_v4f64_0044_v2f64(<2 x double> %a, <2 x double> %b) {
698; ALL-LABEL: shuffle_v4f64_0044_v2f64:
699; ALL:       # %bb.0:
700; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
701; ALL-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
702; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
703; ALL-NEXT:    retq
704  %1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 0>
705  %2 = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> <i32 0, i32 0>
706  %3 = shufflevector <2 x double> %1, <2 x double> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
707  ret <4 x double> %3
708}
709
710define <4 x double> @shuffle_v4f64_1032_v2f64(<2 x double> %a, <2 x double> %b) {
711; ALL-LABEL: shuffle_v4f64_1032_v2f64:
712; ALL:       # %bb.0:
713; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
714; ALL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
715; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
716; ALL-NEXT:    retq
717  %1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 0>
718  %2 = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> <i32 1, i32 0>
719  %3 = shufflevector <2 x double> %1, <2 x double> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
720  ret <4 x double> %3
721}
722
723;PR34359
724define <4 x double> @shuffle_v4f64_2345_0567_select(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
725; ALL-LABEL: shuffle_v4f64_2345_0567_select:
726; ALL:       # %bb.0:
727; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
728; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
729; ALL-NEXT:    retq
730  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
731  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
732  ret <4 x double> %res
733}
734
735define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
736; AVX1-LABEL: shuffle_v4i64_0000:
737; AVX1:       # %bb.0:
738; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
739; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
740; AVX1-NEXT:    retq
741;
742; AVX2-LABEL: shuffle_v4i64_0000:
743; AVX2:       # %bb.0:
744; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
745; AVX2-NEXT:    retq
746;
747; AVX512VL-LABEL: shuffle_v4i64_0000:
748; AVX512VL:       # %bb.0:
749; AVX512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
750; AVX512VL-NEXT:    retq
751  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
752  ret <4 x i64> %shuffle
753}
754
755define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
756; AVX1-LABEL: shuffle_v4i64_0001:
757; AVX1:       # %bb.0:
758; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
759; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
760; AVX1-NEXT:    retq
761;
762; AVX2-LABEL: shuffle_v4i64_0001:
763; AVX2:       # %bb.0:
764; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
765; AVX2-NEXT:    retq
766;
767; AVX512VL-LABEL: shuffle_v4i64_0001:
768; AVX512VL:       # %bb.0:
769; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
770; AVX512VL-NEXT:    retq
771  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
772  ret <4 x i64> %shuffle
773}
774
775define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
776; AVX1-LABEL: shuffle_v4i64_0020:
777; AVX1:       # %bb.0:
778; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
779; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
780; AVX1-NEXT:    retq
781;
782; AVX2-LABEL: shuffle_v4i64_0020:
783; AVX2:       # %bb.0:
784; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
785; AVX2-NEXT:    retq
786;
787; AVX512VL-LABEL: shuffle_v4i64_0020:
788; AVX512VL:       # %bb.0:
789; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
790; AVX512VL-NEXT:    retq
791  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
792  ret <4 x i64> %shuffle
793}
794
795define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
796; AVX1-LABEL: shuffle_v4i64_0112:
797; AVX1:       # %bb.0:
798; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
799; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
800; AVX1-NEXT:    retq
801;
802; AVX2-LABEL: shuffle_v4i64_0112:
803; AVX2:       # %bb.0:
804; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2]
805; AVX2-NEXT:    retq
806;
807; AVX512VL-LABEL: shuffle_v4i64_0112:
808; AVX512VL:       # %bb.0:
809; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2]
810; AVX512VL-NEXT:    retq
811  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
812  ret <4 x i64> %shuffle
813}
814
815define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
816; AVX1-LABEL: shuffle_v4i64_0300:
817; AVX1:       # %bb.0:
818; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
819; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
820; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
821; AVX1-NEXT:    retq
822;
823; AVX2-LABEL: shuffle_v4i64_0300:
824; AVX2:       # %bb.0:
825; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
826; AVX2-NEXT:    retq
827;
828; AVX512VL-LABEL: shuffle_v4i64_0300:
829; AVX512VL:       # %bb.0:
830; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
831; AVX512VL-NEXT:    retq
832  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
833  ret <4 x i64> %shuffle
834}
835
836define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
837; AVX1-LABEL: shuffle_v4i64_1000:
838; AVX1:       # %bb.0:
839; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
840; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
841; AVX1-NEXT:    retq
842;
843; AVX2-LABEL: shuffle_v4i64_1000:
844; AVX2:       # %bb.0:
845; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
846; AVX2-NEXT:    retq
847;
848; AVX512VL-LABEL: shuffle_v4i64_1000:
849; AVX512VL:       # %bb.0:
850; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
851; AVX512VL-NEXT:    retq
852  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
853  ret <4 x i64> %shuffle
854}
855
856define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
857; AVX1-LABEL: shuffle_v4i64_2200:
858; AVX1:       # %bb.0:
859; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
860; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
861; AVX1-NEXT:    retq
862;
863; AVX2-LABEL: shuffle_v4i64_2200:
864; AVX2:       # %bb.0:
865; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
866; AVX2-NEXT:    retq
867;
868; AVX512VL-LABEL: shuffle_v4i64_2200:
869; AVX512VL:       # %bb.0:
870; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
871; AVX512VL-NEXT:    retq
872  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
873  ret <4 x i64> %shuffle
874}
875
876define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
877; AVX1-LABEL: shuffle_v4i64_3330:
878; AVX1:       # %bb.0:
879; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
880; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
881; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[2]
882; AVX1-NEXT:    retq
883;
884; AVX2-LABEL: shuffle_v4i64_3330:
885; AVX2:       # %bb.0:
886; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
887; AVX2-NEXT:    retq
888;
889; AVX512VL-LABEL: shuffle_v4i64_3330:
890; AVX512VL:       # %bb.0:
891; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
892; AVX512VL-NEXT:    retq
893  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
894  ret <4 x i64> %shuffle
895}
896
897define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
898; AVX1-LABEL: shuffle_v4i64_3210:
899; AVX1:       # %bb.0:
900; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
901; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
902; AVX1-NEXT:    retq
903;
904; AVX2-LABEL: shuffle_v4i64_3210:
905; AVX2:       # %bb.0:
906; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
907; AVX2-NEXT:    retq
908;
909; AVX512VL-LABEL: shuffle_v4i64_3210:
910; AVX512VL:       # %bb.0:
911; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
912; AVX512VL-NEXT:    retq
913  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
914  ret <4 x i64> %shuffle
915}
916
917define <4 x i64> @shuffle_v4i64_0213(<4 x i64> %a, <4 x i64> %b) {
918; AVX1-LABEL: shuffle_v4i64_0213:
919; AVX1:       # %bb.0:
920; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,2,3]
921; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
922; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
923; AVX1-NEXT:    retq
924;
925; AVX2-LABEL: shuffle_v4i64_0213:
926; AVX2:       # %bb.0:
927; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
928; AVX2-NEXT:    retq
929;
930; AVX512VL-LABEL: shuffle_v4i64_0213:
931; AVX512VL:       # %bb.0:
932; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
933; AVX512VL-NEXT:    retq
934  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
935  ret <4 x i64> %shuffle
936}
937
938define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
939; AVX1-LABEL: shuffle_v4i64_0124:
940; AVX1:       # %bb.0:
941; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
942; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
943; AVX1-NEXT:    retq
944;
945; AVX2-LABEL: shuffle_v4i64_0124:
946; AVX2:       # %bb.0:
947; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
948; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
949; AVX2-NEXT:    retq
950;
951; AVX512VL-SLOW-LABEL: shuffle_v4i64_0124:
952; AVX512VL-SLOW:       # %bb.0:
953; AVX512VL-SLOW-NEXT:    vbroadcastsd %xmm1, %ymm1
954; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
955; AVX512VL-SLOW-NEXT:    retq
956;
957; AVX512VL-FAST-LABEL: shuffle_v4i64_0124:
958; AVX512VL-FAST:       # %bb.0:
959; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,4]
960; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
961; AVX512VL-FAST-NEXT:    retq
962  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
963  ret <4 x i64> %shuffle
964}
965
966define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
967; AVX1-LABEL: shuffle_v4i64_0142:
968; AVX1:       # %bb.0:
969; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
970; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
971; AVX1-NEXT:    retq
972;
973; AVX2-LABEL: shuffle_v4i64_0142:
974; AVX2:       # %bb.0:
975; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
976; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
977; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
978; AVX2-NEXT:    retq
979;
980; AVX512VL-LABEL: shuffle_v4i64_0142:
981; AVX512VL:       # %bb.0:
982; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,2]
983; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
984; AVX512VL-NEXT:    retq
985  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
986  ret <4 x i64> %shuffle
987}
988
989define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
990; AVX1-LABEL: shuffle_v4i64_0412:
991; AVX1:       # %bb.0:
992; AVX1-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3]
993; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
994; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
995; AVX1-NEXT:    retq
996;
997; AVX2-LABEL: shuffle_v4i64_0412:
998; AVX2:       # %bb.0:
999; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
1000; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2]
1001; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
1002; AVX2-NEXT:    retq
1003;
1004; AVX512VL-LABEL: shuffle_v4i64_0412:
1005; AVX512VL:       # %bb.0:
1006; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,4,1,2]
1007; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1008; AVX512VL-NEXT:    retq
1009  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
1010  ret <4 x i64> %shuffle
1011}
1012
1013define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
1014; AVX1-LABEL: shuffle_v4i64_4012:
1015; AVX1:       # %bb.0:
1016; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
1017; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[2]
1018; AVX1-NEXT:    retq
1019;
1020; AVX2-LABEL: shuffle_v4i64_4012:
1021; AVX2:       # %bb.0:
1022; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
1023; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
1024; AVX2-NEXT:    retq
1025;
1026; AVX512VL-SLOW-LABEL: shuffle_v4i64_4012:
1027; AVX512VL-SLOW:       # %bb.0:
1028; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
1029; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
1030; AVX512VL-SLOW-NEXT:    retq
1031;
1032; AVX512VL-FAST-LABEL: shuffle_v4i64_4012:
1033; AVX512VL-FAST:       # %bb.0:
1034; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,0,1,2]
1035; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1036; AVX512VL-FAST-NEXT:    retq
1037  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
1038  ret <4 x i64> %shuffle
1039}
1040
1041define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) {
1042; ALL-LABEL: shuffle_v4i64_0145:
1043; ALL:       # %bb.0:
1044; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1045; ALL-NEXT:    retq
1046  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1047  ret <4 x i64> %shuffle
1048}
1049
1050define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
1051; AVX1-LABEL: shuffle_v4i64_0451:
1052; AVX1:       # %bb.0:
1053; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
1054; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1055; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1056; AVX1-NEXT:    retq
1057;
1058; AVX2-LABEL: shuffle_v4i64_0451:
1059; AVX2:       # %bb.0:
1060; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,3]
1061; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
1062; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
1063; AVX2-NEXT:    retq
1064;
1065; AVX512VL-LABEL: shuffle_v4i64_0451:
1066; AVX512VL:       # %bb.0:
1067; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,4,5,1]
1068; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1069; AVX512VL-NEXT:    retq
1070  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
1071  ret <4 x i64> %shuffle
1072}
1073
1074define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) {
1075; ALL-LABEL: shuffle_v4i64_4501:
1076; ALL:       # %bb.0:
1077; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1078; ALL-NEXT:    retq
1079  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
1080  ret <4 x i64> %shuffle
1081}
1082
1083define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
1084; AVX1-LABEL: shuffle_v4i64_4015:
1085; AVX1:       # %bb.0:
1086; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
1087; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1088; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1089; AVX1-NEXT:    retq
1090;
1091; AVX2-LABEL: shuffle_v4i64_4015:
1092; AVX2:       # %bb.0:
1093; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
1094; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
1095; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
1096; AVX2-NEXT:    retq
1097;
1098; AVX512VL-LABEL: shuffle_v4i64_4015:
1099; AVX512VL:       # %bb.0:
1100; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,0,1,5]
1101; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1102; AVX512VL-NEXT:    retq
1103  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
1104  ret <4 x i64> %shuffle
1105}
1106
1107define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
1108; AVX1-LABEL: shuffle_v4i64_2u35:
1109; AVX1:       # %bb.0:
1110; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
1111; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
1112; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
1113; AVX1-NEXT:    retq
1114;
1115; AVX2-LABEL: shuffle_v4i64_2u35:
1116; AVX2:       # %bb.0:
1117; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
1118; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
1119; AVX2-NEXT:    retq
1120;
1121; AVX512VL-SLOW-LABEL: shuffle_v4i64_2u35:
1122; AVX512VL-SLOW:       # %bb.0:
1123; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
1124; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
1125; AVX512VL-SLOW-NEXT:    retq
1126;
1127; AVX512VL-FAST-LABEL: shuffle_v4i64_2u35:
1128; AVX512VL-FAST:       # %bb.0:
1129; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,5,3,5]
1130; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1131; AVX512VL-FAST-NEXT:    retq
1132  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
1133  ret <4 x i64> %shuffle
1134}
1135
1136define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
1137; AVX1-LABEL: shuffle_v4i64_1251:
1138; AVX1:       # %bb.0:
1139; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
1140; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1141; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[3],ymm2[3]
1142; AVX1-NEXT:    retq
1143;
1144; AVX2-LABEL: shuffle_v4i64_1251:
1145; AVX2:       # %bb.0:
1146; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[1,1,1,1]
1147; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1]
1148; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
1149; AVX2-NEXT:    retq
1150;
1151; AVX512VL-LABEL: shuffle_v4i64_1251:
1152; AVX512VL:       # %bb.0:
1153; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,5,1]
1154; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1155; AVX512VL-NEXT:    retq
1156  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
1157  ret <4 x i64> %shuffle
1158}
1159
1160define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) {
1161; AVX1-LABEL: shuffle_v4i64_1054:
1162; AVX1:       # %bb.0:
1163; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1164; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
1165; AVX1-NEXT:    retq
1166;
1167; AVX2-LABEL: shuffle_v4i64_1054:
1168; AVX2:       # %bb.0:
1169; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1170; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
1171; AVX2-NEXT:    retq
1172;
1173; AVX512VL-SLOW-LABEL: shuffle_v4i64_1054:
1174; AVX512VL-SLOW:       # %bb.0:
1175; AVX512VL-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1176; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
1177; AVX512VL-SLOW-NEXT:    retq
1178;
1179; AVX512VL-FAST-LABEL: shuffle_v4i64_1054:
1180; AVX512VL-FAST:       # %bb.0:
1181; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,5,4]
1182; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1183; AVX512VL-FAST-NEXT:    retq
1184  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
1185  ret <4 x i64> %shuffle
1186}
1187
1188define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
1189; AVX1-LABEL: shuffle_v4i64_3254:
1190; AVX1:       # %bb.0:
1191; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
1192; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
1193; AVX1-NEXT:    retq
1194;
1195; AVX2-LABEL: shuffle_v4i64_3254:
1196; AVX2:       # %bb.0:
1197; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
1198; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
1199; AVX2-NEXT:    retq
1200;
1201; AVX512VL-SLOW-LABEL: shuffle_v4i64_3254:
1202; AVX512VL-SLOW:       # %bb.0:
1203; AVX512VL-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
1204; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
1205; AVX512VL-SLOW-NEXT:    retq
1206;
1207; AVX512VL-FAST-LABEL: shuffle_v4i64_3254:
1208; AVX512VL-FAST:       # %bb.0:
1209; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,5,4]
1210; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1211; AVX512VL-FAST-NEXT:    retq
1212  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
1213  ret <4 x i64> %shuffle
1214}
1215
1216define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
1217; AVX1-LABEL: shuffle_v4i64_3276:
1218; AVX1:       # %bb.0:
1219; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1220; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
1221; AVX1-NEXT:    retq
1222;
1223; AVX2-LABEL: shuffle_v4i64_3276:
1224; AVX2:       # %bb.0:
1225; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1226; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
1227; AVX2-NEXT:    retq
1228;
1229; AVX512VL-SLOW-LABEL: shuffle_v4i64_3276:
1230; AVX512VL-SLOW:       # %bb.0:
1231; AVX512VL-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1232; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
1233; AVX512VL-SLOW-NEXT:    retq
1234;
1235; AVX512VL-FAST-LABEL: shuffle_v4i64_3276:
1236; AVX512VL-FAST:       # %bb.0:
1237; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,2,7,6]
1238; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1239; AVX512VL-FAST-NEXT:    retq
1240  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
1241  ret <4 x i64> %shuffle
1242}
1243
1244define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) {
1245; AVX1-LABEL: shuffle_v4i64_1076:
1246; AVX1:       # %bb.0:
1247; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
1248; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
1249; AVX1-NEXT:    retq
1250;
1251; AVX2-LABEL: shuffle_v4i64_1076:
1252; AVX2:       # %bb.0:
1253; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1254; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
1255; AVX2-NEXT:    retq
1256;
1257; AVX512VL-SLOW-LABEL: shuffle_v4i64_1076:
1258; AVX512VL-SLOW:       # %bb.0:
1259; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1260; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
1261; AVX512VL-SLOW-NEXT:    retq
1262;
1263; AVX512VL-FAST-LABEL: shuffle_v4i64_1076:
1264; AVX512VL-FAST:       # %bb.0:
1265; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,7,6]
1266; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1267; AVX512VL-FAST-NEXT:    retq
1268  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
1269  ret <4 x i64> %shuffle
1270}
1271
1272define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
1273; AVX1-LABEL: shuffle_v4i64_0415:
1274; AVX1:       # %bb.0:
1275; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
1276; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1277; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1278; AVX1-NEXT:    retq
1279;
1280; AVX2-LABEL: shuffle_v4i64_0415:
1281; AVX2:       # %bb.0:
1282; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
1283; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
1284; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
1285; AVX2-NEXT:    retq
1286;
1287; AVX512VL-LABEL: shuffle_v4i64_0415:
1288; AVX512VL:       # %bb.0:
1289; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,4,1,5]
1290; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0
1291; AVX512VL-NEXT:    retq
1292  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
1293  ret <4 x i64> %shuffle
1294}
1295
1296define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
1297; AVX1-LABEL: shuffle_v4i64_z4z6:
1298; AVX1:       # %bb.0:
1299; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1300; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
1301; AVX1-NEXT:    retq
1302;
1303; AVX2-LABEL: shuffle_v4i64_z4z6:
1304; AVX2:       # %bb.0:
1305; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
1306; AVX2-NEXT:    retq
1307;
1308; AVX512VL-LABEL: shuffle_v4i64_z4z6:
1309; AVX512VL:       # %bb.0:
1310; AVX512VL-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
1311; AVX512VL-NEXT:    retq
1312  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 0, i32 6>
1313  ret <4 x i64> %shuffle
1314}
1315
1316define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
1317; AVX1-LABEL: shuffle_v4i64_5zuz:
1318; AVX1:       # %bb.0:
1319; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1320; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
1321; AVX1-NEXT:    retq
1322;
1323; AVX2-LABEL: shuffle_v4i64_5zuz:
1324; AVX2:       # %bb.0:
1325; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
1326; AVX2-NEXT:    retq
1327;
1328; AVX512VL-LABEL: shuffle_v4i64_5zuz:
1329; AVX512VL:       # %bb.0:
1330; AVX512VL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
1331; AVX512VL-NEXT:    retq
1332  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 0, i32 undef, i32 0>
1333  ret <4 x i64> %shuffle
1334}
1335
define <4 x i64> @shuffle_v4i64_40u2(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_40u2:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 2>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_15uu:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_11uu:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_22uu:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_22uu:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_22uu:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 undef, i32 undef>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3333:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_3333:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_3333:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_1z3z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1z3z:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_1z3z:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_1z3z:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_1032_v2i64:
; ALL:       # %bb.0:
; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; ALL-NEXT:    retq
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %2 = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %3
}

define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: stress_test1:
; ALL:         retq
  %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
  %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> <i32 3, i32 undef, i32 2, i32 undef>
  %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 undef>
  %f = shufflevector <4 x i64> %d, <4 x i64> %e, <4 x i32> <i32 5, i32 1, i32 1, i32 0>

  ret <4 x i64> %f
}

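; Insert a scalar into element 0 with the remaining elements zeroed; the scalar moves implicitly zero the rest of the register.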
define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
; ALL-LABEL: insert_reg_and_zero_v4i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovq %rdi, %xmm0
; ALL-NEXT:    retq
  %v = insertelement <4 x i64> undef, i64 %a, i64 0
  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v4i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
  %a = load i64, i64* %ptr
  %v = insertelement <4 x i64> undef, i64 %a, i64 0
  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; ALL-LABEL: insert_reg_and_zero_v4f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT:    retq
  %v = insertelement <4 x double> undef, double %a, i32 0
  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x double> %shuffle
}

define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v4f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
  %a = load double, double* %ptr
  %v = insertelement <4 x double> undef, double %a, i32 0
  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x double> %shuffle
}

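; Splats: a 64-bit element broadcast to all four lanes, from memory or from a register.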
define <4 x double> @splat_mem_v4f64(double* %ptr) {
; ALL-LABEL: splat_mem_v4f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %a = load double, double* %ptr
  %v = insertelement <4 x double> undef, double %a, i32 0
  %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x double> %shuffle
}

define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
; ALL-LABEL: splat_mem_v4i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %a = load i64, i64* %ptr
  %v = insertelement <4 x i64> undef, i64 %a, i64 0
  %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x i64> %shuffle
}

define <4 x double> @splat_mem_v4f64_2(double* %p) {
; ALL-LABEL: splat_mem_v4f64_2:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %1 = load double, double* %p
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %3
}

define <4 x double> @splat_v4f64(<2 x double> %r) {
; AVX1-LABEL: splat_v4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splat_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: splat_v4f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
  %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %1
}

define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; ALL-LABEL: splat_mem_v4i64_from_v2i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %v = load <2 x i64>, <2 x i64>* %ptr
  %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x i64> %shuffle
}

define <4 x double> @splat_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
; ALL-LABEL: splat_mem_v4f64_from_v2f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %v = load <2 x double>, <2 x double>* %ptr
  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x double> %shuffle
}

define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; AVX1OR2-LABEL: splat128_mem_v4i64_from_v2i64:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX1OR2-NEXT:    retq
;
; AVX512VL-LABEL: splat128_mem_v4i64_from_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512VL-NEXT:    retq
  %v = load <2 x i64>, <2 x i64>* %ptr
  %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
; ALL-LABEL: splat128_mem_v4f64_from_v2f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; ALL-NEXT:    retq
  %v = load <2 x double>, <2 x double>* %ptr
  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %shuffle
}

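; Broadcast of element 0 seen through a bitcast from <2 x i64> to <4 x double>.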
define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: broadcast_v4f64_0000_from_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: broadcast_v4f64_0000_from_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = bitcast <4 x i64> %1 to <4 x double>
  %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %3
}

define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: bitcast_v4f64_0426:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; ALL-NEXT:    retq
  %shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
  %bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float>
  %shuffle32 = shufflevector <8 x float> %bitcast32, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  %bitcast16 = bitcast <8 x float> %shuffle32 to <16 x i16>
  %shuffle16 = shufflevector <16 x i16> %bitcast16, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
  %bitcast64 = bitcast <16 x i16> %shuffle16 to <4 x double>
  ret <4 x double> %bitcast64
}

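; Concatenations of 128-bit halves expressed as shuffles of extracted subvectors.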
define <4 x i64> @concat_v4i64_0167(<4 x i64> %a0, <4 x i64> %a1) {
; ALL-LABEL: concat_v4i64_0167:
; ALL:       # %bb.0:
; ALL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT:    retq
  %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
  %a1hi = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 6, i32 7>
  %shuffle64 = shufflevector <2 x i64> %a0lo, <2 x i64> %a1hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %shuffle64
}

define <4 x i64> @concat_v4i64_0145_bc(<4 x i64> %a0, <4 x i64> %a1) {
; ALL-LABEL: concat_v4i64_0145_bc:
; ALL:       # %bb.0:
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    retq
  %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
  %a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 4, i32 5>
  %bc0lo = bitcast <2 x i64> %a0lo to <4 x i32>
  %bc1lo = bitcast <2 x i64> %a1lo to <4 x i32>
  %shuffle32 = shufflevector <4 x i32> %bc0lo, <4 x i32> %bc1lo, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %shuffle64 = bitcast <8 x i32> %shuffle32 to <4 x i64>
  ret <4 x i64> %shuffle64
}

define <4 x i64> @insert_dup_mem_v4i64(i64* %ptr) {
; ALL-LABEL: insert_dup_mem_v4i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastsd (%rdi), %ymm0
; ALL-NEXT:    retq
  %tmp = load i64, i64* %ptr, align 1
  %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0
  %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %tmp2
}

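; Element rotations across the 128-bit lane boundary; AVX512VL can use a single valignq.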
define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1234:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1]
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_1234:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1]
; AVX2-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_1234:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_1230(<4 x i64> %a) {
; AVX1-LABEL: shuffle_v4i64_1230:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_1230:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v4i64_1230:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX512VL-NEXT:    retq
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_z0z3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v4i64_z0z3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v4i64_z0z3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
; AVX2-FAST-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_z0z3:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
; AVX512VL-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_z0z3:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31]
; AVX512VL-FAST-NEXT:    retq
  %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 3>
  ret <4 x i64> %1
}

define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1z2z:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v4i64_1z2z:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_1z2z:
; AVX512VL-SLOW:       # %bb.0:
; AVX512VL-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z:
; AVX512VL-FAST:       # %bb.0:
; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-FAST-NEXT:    retq
  %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
  ret <4 x i64> %1
}

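; Even/odd element shuffles feeding an add: the floating-point cases match vhaddpd, while the i64 cases have no horizontal add and use unpacks/permutes instead.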
define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: add_v4f64_0246_1357:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: add_v4f64_0246_1357:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: add_v4f64_0246_1357:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = fadd <4 x double> %shuffle, %shuffle1
  ret <4 x double> %add
}

define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: add_v4f64_4602_5713:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: add_v4f64_4602_5713:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2]
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: add_v4f64_4602_5713:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,0,2]
; AVX512VL-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
  %shuffle1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
  %add = fadd <4 x double> %shuffle, %shuffle1
  ret <4 x double> %add
}

define <4 x i64> @add_v4i64_0246_1357(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: add_v4i64_0246_1357:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: add_v4i64_0246_1357:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: add_v4i64_0246_1357:
; AVX512VL-SLOW:       # %bb.0: # %entry
; AVX512VL-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX512VL-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: add_v4i64_0246_1357:
; AVX512VL-FAST:       # %bb.0: # %entry
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
; AVX512VL-FAST-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
; AVX512VL-FAST-NEXT:    vpermi2q %ymm1, %ymm0, %ymm3
; AVX512VL-FAST-NEXT:    vpaddq %ymm3, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = add <4 x i64> %shuffle, %shuffle1
  ret <4 x i64> %add
}

define <4 x i64> @add_v4i64_4602_5713(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: add_v4i64_4602_5713:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: add_v4i64_4602_5713:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-SLOW-LABEL: add_v4i64_4602_5713:
; AVX512VL-SLOW:       # %bb.0: # %entry
; AVX512VL-SLOW-NEXT:    vpunpcklqdq {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX512VL-SLOW-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512VL-SLOW-NEXT:    retq
;
; AVX512VL-FAST-LABEL: add_v4i64_4602_5713:
; AVX512VL-FAST:       # %bb.0: # %entry
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6]
; AVX512VL-FAST-NEXT:    vpermi2q %ymm0, %ymm1, %ymm2
; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,3,5,7]
; AVX512VL-FAST-NEXT:    vpermi2q %ymm0, %ymm1, %ymm3
; AVX512VL-FAST-NEXT:    vpaddq %ymm3, %ymm2, %ymm0
; AVX512VL-FAST-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 6, i32 0, i32 2>
  %shuffle1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
  %add = add <4 x i64> %shuffle, %shuffle1
  ret <4 x i64> %add
}

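; Move-and-zero shuffles when optimizing for size, via the optsize attribute or profile-guided size optimization (!prof).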
define <4 x double> @shuffle_v4f64_0zzz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_0zzz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT:    retq
  %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x double> %b
}

define <4 x i64> @shuffle_v4i64_0zzz_optsize(<4 x i64> %a) optsize {
; ALL-LABEL: shuffle_v4i64_0zzz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT:    retq
  %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i64> %b
}

define <8 x float> @shuffle_v8f32_0zzzzzzz_optsize(<8 x float> %a) optsize {
; ALL-LABEL: shuffle_v8f32_0zzzzzzz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; ALL-NEXT:    retq
  %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %b
}

define <8 x i32> @shuffle_v8i32_0zzzzzzz_optsize(<8 x i32> %a) optsize {
; ALL-LABEL: shuffle_v8i32_0zzzzzzz_optsize:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; ALL-NEXT:    retq
  %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %b
}

define <4 x double> @shuffle_v4f64_0zzz_pgso(<4 x double> %a) !prof !14 {
; ALL-LABEL: shuffle_v4f64_0zzz_pgso:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT:    retq
  %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x double> %b
}

define <4 x i64> @shuffle_v4i64_0zzz_pgso(<4 x i64> %a) !prof !14 {
; ALL-LABEL: shuffle_v4i64_0zzz_pgso:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT:    retq
  %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i64> %b
}

define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 {
; AVX1OR2-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1OR2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VL-NEXT:    retq
  %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %b
}

define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 {
; AVX1OR2-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1OR2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VL-NEXT:    retq
  %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %b
}

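; High unpacks where only the low 128 bits of the result are used, so the operation narrows to an xmm vunpckhpd.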
define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
; ALL-LABEL: unpckh_v4i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; ALL-NEXT:    retq
  %unpckh = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
  ret <4 x i64> %unpckh
}

define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
; ALL-LABEL: unpckh_v4f64:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
; ALL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; ALL-NEXT:    retq
  %unpckh = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 7, i32 undef, i32 undef>
  ret <4 x double> %unpckh
}

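; Profile summary plus a zero function_entry_count; the _pgso functions above reference !14 to exercise profile-guided size optimization.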
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}