; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2

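; For reference: vperm2f128/vperm2i128 build each 128-bit half of the result
; from one of the four 128-bit halves of the two sources, selected by an 8-bit
; immediate (bits [1:0] choose the low half, bits [5:4] the high half, and
; bits 3/7 zero the corresponding half instead). In the autogenerated asm
; comments below the ymm values are printed as four 64-bit elements, so
; ymm0[2,3,0,1] means ymm0 with its two 128-bit halves swapped.
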
define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45670123:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45670123_mem:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123_mem:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vmovaps (%rdi), %ymm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8f32_01230123_mem:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45674567:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_45674567_mem:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v32i8_2323:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v4i64_6701:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  ; add forces execution domain
  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovaps (%rsi), %ymm1
; AVX1-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vmovdqa (%rsi), %ymm1
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  %c = load <16 x i16>, <16 x i16>* %a
  %d = load <16 x i16>, <16 x i16>* %b
  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

;;;; Cases with undef indices mixed in the mask

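; An undef element in the mask may be assigned to whichever 128-bit half keeps
; the whole shuffle expressible as a single vperm2f128.
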
define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uu67:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu674567:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uu67:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

;;;; Cases where we must not select a single vperm2f128

define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL:       ## BB#0: ## %entry
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
  ret <8 x float> %shuffle
}

;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+vblendpd over vperm2f128 because it has better performance.
;; TODO: When building for optsize we should use vperm2f128.

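; (A single vperm2f128 with a zeroing immediate can produce these patterns in
; one instruction, as shuffle_v4f64_23zz and shuffle_v4f64_67zz below show.)
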
define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz01:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz23:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz45:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz67:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL:       ## BB#0:
; ALL-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT:    retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}

;; With AVX2, select the integer version of the instruction. Use an add to force the domain selection.

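; (vperm2i128 keeps the value in the integer domain, avoiding a possible
; bypass delay between the integer and floating-point units on some CPUs.)
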
define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1:       ## BB#0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2:       ## BB#0:
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  %c = add <4 x i64> %b, %s
  ret <4 x i64> %c
}

;;; Memory folding cases

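; When a shuffle input comes straight from memory, the load should be folded
; into the vperm2f128/vperm2i128 memory operand (shown as mem[...] in the
; checks) rather than being loaded into a register first.
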
define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4f64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4f64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x double>, <4 x double> * %pa
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4f64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4f64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x double>, <4 x double> * %pb
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8f32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8f32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x float>, <8 x float> * %pa
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8f32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8f32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x float>, <8 x float> * %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4i64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_4i64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <4 x i64>, <4 x i64> * %pa
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4i64:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_4i64:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <4 x i64>, <4 x i64> * %pb
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8i32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld0_hi0_lo1_8i32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %a = load <8 x i32>, <8 x i32> * %pa
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8i32:
; AVX1:       ## BB#0: ## %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: ld1_hi0_hi1_8i32:
; AVX2:       ## BB#0: ## %entry
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
entry:
  %b = load <8 x i32>, <8 x i32> * %pb
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}