; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
;
; Just one 32-bit run to make sure we do reasonable things.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX

; Concatenates the <2 x double> vectors loaded from %ptr[2] and %ptr[3] into
; one <4 x double>. The checks expect both loads to fold into a single 256-bit
; vmovups from byte offset 32.
define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_2f64_23:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 32(%rdi), %ymm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4f64_2f64_23:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovups 32(%eax), %ymm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3
  %val0 = load <2 x double>, <2 x double>* %ptr0
  %val1 = load <2 x double>, <2 x double>* %ptr1
  %res = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

; Concatenates the <2 x double> loaded from %ptr[2] with a zero vector to form
; a <4 x double>. The checks expect only a 128-bit vmovaps from byte offset 32,
; with no separate instruction for the zero upper half.
define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_2f64_2z:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4f64_2f64_2z:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps 32(%eax), %xmm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
  %val0 = load <2 x double>, <2 x double>* %ptr0
  %res = shufflevector <2 x double> %val0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

; Builds a <4 x double> from four scalar loads of %ptr[2], %ptr[3], %ptr[4] and
; %ptr[5], inserted into lanes 0..3 in that order. The checks expect a single
; 256-bit vmovups from byte offset 16.
define <4 x double> @merge_4f64_f64_2345(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_2345:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 16(%rdi), %ymm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4f64_f64_2345:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovups 16(%eax), %ymm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr2 = getelementptr inbounds double, double* %ptr, i64 4
  %ptr3 = getelementptr inbounds double, double* %ptr, i64 5
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %val2 = load double, double* %ptr2
  %val3 = load double, double* %ptr3
  %res0 = insertelement <4 x double> undef, double %val0, i32 0
  %res1 = insertelement <4 x double> %res0, double %val1, i32 1
  %res2 = insertelement <4 x double> %res1, double %val2, i32 2
  %res3 = insertelement <4 x double> %res2, double %val3, i32 3
  ret <4 x double> %res3
}

; Lane 0 is loaded from %ptr[3], lane 1 is the constant 0.0, and lanes 2-3 are
; left undef. The checks expect a single vmovsd (xmm0 = mem[0],zero).
define <4 x double> @merge_4f64_f64_3zuu(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_3zuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4f64_f64_3zuu:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
  %val0 = load double, double* %ptr0
  %res0 = insertelement <4 x double> undef, double %val0, i32 0
  %res1 = insertelement <4 x double> %res0, double 0.0, i32 1
  ret <4 x double> %res1
}

; Lanes 0 and 1 are loaded from %ptr[3] and %ptr[4]; lanes 2-3 are left undef.
; The checks expect a single 128-bit vmovups from byte offset 24.
define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_34uu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 24(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4f64_f64_34uu:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovups 24(%eax), %xmm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %res0 = insertelement <4 x double> undef, double %val0, i32 0
  %res1 = insertelement <4 x double> %res0, double %val1, i32 1
  ret <4 x double> %res1
}

; Starts from an all-zero <4 x double> and overwrites lanes 0 and 1 with the
; values loaded from %ptr[4] and %ptr[5], so lanes 2-3 stay zero. The checks
; expect a single 128-bit vmovups from byte offset 32.
define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_45zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 32(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4f64_f64_45zz:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovups 32(%eax), %xmm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 4
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 5
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %res0 = insertelement <4 x double> zeroinitializer, double %val0, i32 0
  %res1 = insertelement <4 x double> %res0, double %val1, i32 1
  ret <4 x double> %res1
}

; Lanes 0, 1 and 3 are loaded from %ptr[3], %ptr[4] and %ptr[6]; lane 2 is the
; constant 0.0. The checks expect a zeroed register (vxorps) blended with a
; memory operand by vblendps, the register supplying 32-bit elements 4-5.
define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_34z6:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4f64_f64_34z6:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
  %ptr3 = getelementptr inbounds double, double* %ptr, i64 6
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %val3 = load double, double* %ptr3
  %res0 = insertelement <4 x double> undef, double %val0, i32 0
  %res1 = insertelement <4 x double> %res0, double %val1, i32 1
  %res2 = insertelement <4 x double> %res1, double   0.0, i32 2
  %res3 = insertelement <4 x double> %res2, double %val3, i32 3
  ret <4 x double> %res3
}

; Concatenates the <2 x i64> loaded from %ptr[3] with a zero vector to form a
; <4 x i64>. The checks expect only a 128-bit vmovaps from byte offset 48.
define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4i64_2i64_3z:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 48(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4i64_2i64_3z:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps 48(%eax), %xmm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 3
  %val0 = load <2 x i64>, <2 x i64>* %ptr0
  %res = shufflevector <2 x i64> %val0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %res
}

; Builds a <4 x i64> from four scalar loads of %ptr[1]..%ptr[4], inserted into
; lanes 0..3 in that order. The checks expect a single 256-bit vmovups from
; byte offset 8.
define <4 x i64> @merge_4i64_i64_1234(i64* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4i64_i64_1234:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 8(%rdi), %ymm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4i64_i64_1234:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovups 8(%eax), %ymm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
  %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
  %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
  %ptr3 = getelementptr inbounds i64, i64* %ptr, i64 4
  %val0 = load i64, i64* %ptr0
  %val1 = load i64, i64* %ptr1
  %val2 = load i64, i64* %ptr2
  %val3 = load i64, i64* %ptr3
  %res0 = insertelement <4 x i64> undef, i64 %val0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %val1, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %val2, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %val3, i32 3
  ret <4 x i64> %res3
}

; Lane 0 is loaded from %ptr[1], lanes 1 and 2 are the constant 0, and lane 3
; is left undef. The checks expect a single vmovsd (xmm0 = mem[0],zero).
define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4i64_i64_1zzu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4i64_i64_1zzu:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
  %val0 = load i64, i64* %ptr0
  %res0 = insertelement <4 x i64> undef, i64 %val0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 0, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 0, i32 2
  ret <4 x i64> %res2
}

; Starts from an all-zero <4 x i64> and overwrites lanes 0 and 1 with the
; values loaded from %ptr[2] and %ptr[3], so lanes 2-3 stay zero. The checks
; expect a single 128-bit vmovups from byte offset 16.
define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4i64_i64_23zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups 16(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4i64_i64_23zz:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovups 16(%eax), %xmm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 2
  %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 3
  %val0 = load i64, i64* %ptr0
  %val1 = load i64, i64* %ptr1
  %res0 = insertelement <4 x i64> zeroinitializer, i64 %val0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %val1, i32 1
  ret <4 x i64> %res1
}

; Builds an <8 x float> from <2 x float> pieces: %ptr[2], %ptr[3], a zero pair,
; and %ptr[5]. The three x86_64 runs expect vmovq + a 128-bit load + vpslldq +
; a 128-bit insert (float-domain forms with +avx, integer-domain forms with
; +avx2 and +avx512f). The i686 run expects vxorps followed by a vblendps with
; a memory operand.
define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp {
; AVX1-LABEL: merge_8f32_2f32_23z5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovups 16(%rdi), %xmm1
; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: merge_8f32_2f32_23z5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovdqu 16(%rdi), %xmm1
; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: merge_8f32_2f32_23z5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT:    vmovdqu 16(%rdi), %xmm1
; AVX512F-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; X32-AVX-LABEL: merge_8f32_2f32_23z5:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
  %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
  %ptr3 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 5
  %val0 = load <2 x float>, <2 x float>* %ptr0
  %val1 = load <2 x float>, <2 x float>* %ptr1
  %val3 = load <2 x float>, <2 x float>* %ptr3
  %res01 = shufflevector <2 x float> %val0, <2 x float> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res23 = shufflevector <2 x float> zeroinitializer, <2 x float> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = shufflevector <4 x float> %res01, <4 x float> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

; Concatenates a zero <4 x float> (low half) with the <4 x float> loaded from
; %ptr[2] (high half). The checks expect vxorps to zero the register, then
; vinsertf128 $1 with a memory operand at byte offset 32.
define <8 x float> @merge_8f32_4f32_z2(<4 x float>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8f32_4f32_z2:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_8f32_4f32_z2:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT:    vinsertf128 $1, 32(%eax), %ymm0, %ymm0
; X32-AVX-NEXT:    retl
  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
  %val1 = load <4 x float>, <4 x float>* %ptr1
  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

; Lanes 0 and 1 are loaded from %ptr[1] and %ptr[2]; lanes 2, 3, 6 and 7 are
; the constant 0.0; lanes 4-5 are left undef. The checks expect a single
; vmovsd (xmm0 = mem[0],zero).
define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8f32_f32_12zzuuzz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_8f32_f32_12zzuuzz:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 2
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %res0 = insertelement <8 x float> undef, float %val0, i32 0
  %res1 = insertelement <8 x float> %res0, float %val1, i32 1
  %res2 = insertelement <8 x float> %res1, float   0.0, i32 2
  %res3 = insertelement <8 x float> %res2, float   0.0, i32 3
  %res6 = insertelement <8 x float> %res3, float   0.0, i32 6
  %res7 = insertelement <8 x float> %res6, float   0.0, i32 7
  ret <8 x float> %res7
}

; Lanes 0, 2, 4 and 7 are loaded from %ptr[1], %ptr[3], %ptr[5] and %ptr[8];
; lane 5 is the constant 0.0; lanes 1, 3 and 6 are left undef. The checks
; expect vxorps plus a vblendps with a memory operand, where only element 5
; comes from the zeroed register.
define <8 x float> @merge_8f32_f32_1u3u5zu8(float* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8f32_f32_1u3u5zu8:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_8f32_f32_1u3u5zu8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
  %ptr2 = getelementptr inbounds float, float* %ptr, i64 3
  %ptr4 = getelementptr inbounds float, float* %ptr, i64 5
  %ptr7 = getelementptr inbounds float, float* %ptr, i64 8
  %val0 = load float, float* %ptr0
  %val2 = load float, float* %ptr2
  %val4 = load float, float* %ptr4
  %val7 = load float, float* %ptr7
  %res0 = insertelement <8 x float> undef, float %val0, i32 0
  %res2 = insertelement <8 x float> %res0, float %val2, i32 2
  %res4 = insertelement <8 x float> %res2, float %val4, i32 4
  %res5 = insertelement <8 x float> %res4, float   0.0, i32 5
  %res7 = insertelement <8 x float> %res5, float %val7, i32 7
  ret <8 x float> %res7
}

; Concatenates a zero <4 x i32> (low half) with the <4 x i32> loaded from
; %ptr[3] (high half). The checks expect vxorps to zero the register, then
; vinsertf128 $1 with a memory operand at byte offset 48.
define <8 x i32> @merge_8i32_4i32_z3(<4 x i32>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8i32_4i32_z3:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, 48(%rdi), %ymm0, %ymm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_8i32_4i32_z3:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT:    vinsertf128 $1, 48(%eax), %ymm0, %ymm0
; X32-AVX-NEXT:    retl
  %ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
  %val1 = load <4 x i32>, <4 x i32>* %ptr1
  %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %res
}

; Lanes 0, 1 and 4 are loaded from %ptr[5], %ptr[6] and %ptr[9]; lanes 2, 3, 6
; and 7 are the constant 0; lane 5 is left undef. The checks expect one vmovsd
; for the low half, one vmovss for the high half, and a vinsertf128 joining
; the two.
define <8 x i32> @merge_8i32_i32_56zz9uzz(i32* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8i32_i32_56zz9uzz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_8i32_i32_56zz9uzz:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 5
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 6
  %ptr4 = getelementptr inbounds i32, i32* %ptr, i64 9
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %val4 = load i32, i32* %ptr4
  %res0 = insertelement <8 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %val1, i32 1
  %res2 = insertelement <8 x i32> %res1, i32     0, i32 2
  %res3 = insertelement <8 x i32> %res2, i32     0, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %val4, i32 4
  %res6 = insertelement <8 x i32> %res4, i32     0, i32 6
  %res7 = insertelement <8 x i32> %res6, i32     0, i32 7
  ret <8 x i32> %res7
}

; Lanes 0, 2, 4 and 7 are loaded from %ptr[1], %ptr[3], %ptr[5] and %ptr[8];
; lane 5 is the constant 0; lanes 1, 3 and 6 are left undef. The checks expect
; vxorps plus a vblendps with a memory operand, where only element 5 comes
; from the zeroed register.
define <8 x i32> @merge_8i32_i32_1u3u5zu8(i32* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8i32_i32_1u3u5zu8:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_8i32_i32_1u3u5zu8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
  %ptr2 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr4 = getelementptr inbounds i32, i32* %ptr, i64 5
  %ptr7 = getelementptr inbounds i32, i32* %ptr, i64 8
  %val0 = load i32, i32* %ptr0
  %val2 = load i32, i32* %ptr2
  %val4 = load i32, i32* %ptr4
  %val7 = load i32, i32* %ptr7
  %res0 = insertelement <8 x i32> undef, i32 %val0, i32 0
  %res2 = insertelement <8 x i32> %res0, i32 %val2, i32 2
  %res4 = insertelement <8 x i32> %res2, i32 %val4, i32 4
  %res5 = insertelement <8 x i32> %res4, i32     0, i32 5
  %res7 = insertelement <8 x i32> %res5, i32 %val7, i32 7
  ret <8 x i32> %res7
}

; Lanes 0 and 1 are loaded from %ptr[8] and %ptr[9]; lanes 2, 3, 4 and 15 are
; the constant 0; every other lane is left undef. The checks expect a single
; vmovss (xmm0 = mem[0],zero,zero,zero).
define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(i16* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 8
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 9
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <16 x i16> %res0, i16 %val1, i16 1
  %res2 = insertelement <16 x i16> %res1, i16     0, i16 2
  %res3 = insertelement <16 x i16> %res2, i16     0, i16 3
  %res4 = insertelement <16 x i16> %res3, i16     0, i16 4
  %resF = insertelement <16 x i16> %res4, i16     0, i16 15
  ret <16 x i16> %resF
}

; Lanes 0, 1 and 3 are loaded from %ptr[4], %ptr[5] and %ptr[7]; every other
; lane is left undef. The checks expect a single vmovsd (xmm0 = mem[0],zero).
define <16 x i16> @merge_16i16_i16_45u7uuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %val3 = load i16, i16* %ptr3
  %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <16 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <16 x i16> %res1, i16 %val3, i16 3
  ret <16 x i16> %res3
}

; Lanes 0, 3, 12, 14 and 15 are each loaded from the %ptr element with the same
; index; every other lane is left undef. The checks expect a single 256-bit
; vmovups from offset 0.
define <16 x i16> @merge_16i16_i16_0uu3uuuuuuuuCuEF(i16* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovups (%eax), %ymm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3
  %ptrC = getelementptr inbounds i16, i16* %ptr, i64 12
  %ptrE = getelementptr inbounds i16, i16* %ptr, i64 14
  %ptrF = getelementptr inbounds i16, i16* %ptr, i64 15
  %val0 = load i16, i16* %ptr0
  %val3 = load i16, i16* %ptr3
  %valC = load i16, i16* %ptrC
  %valE = load i16, i16* %ptrE
  %valF = load i16, i16* %ptrF
  %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
  %res3 = insertelement <16 x i16> %res0, i16 %val3, i16 3
  %resC = insertelement <16 x i16> %res3, i16 %valC, i16 12
  %resE = insertelement <16 x i16> %resC, i16 %valE, i16 14
  %resF = insertelement <16 x i16> %resE, i16 %valF, i16 15
  ret <16 x i16> %resF
}

; Lanes 0, 3, 12, 14 and 15 are each loaded from the %ptr element with the same
; index; lanes 4, 5 and 13 are the constant 0; every other lane is left undef.
; The checks expect a 256-bit vmovups from offset 0 followed by a vandps
; against a constant-pool operand.
define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(i16* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovups (%eax), %ymm0
; X32-AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3
  %ptrC = getelementptr inbounds i16, i16* %ptr, i64 12
  %ptrE = getelementptr inbounds i16, i16* %ptr, i64 14
  %ptrF = getelementptr inbounds i16, i16* %ptr, i64 15
  %val0 = load i16, i16* %ptr0
  %val3 = load i16, i16* %ptr3
  %valC = load i16, i16* %ptrC
  %valE = load i16, i16* %ptrE
  %valF = load i16, i16* %ptrF
  %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
  %res3 = insertelement <16 x i16> %res0, i16 %val3, i16 3
  %res4 = insertelement <16 x i16> %res3, i16     0, i16 4
  %res5 = insertelement <16 x i16> %res4, i16     0, i16 5
  %resC = insertelement <16 x i16> %res5, i16 %valC, i16 12
  %resD = insertelement <16 x i16> %resC, i16     0, i16 13
  %resE = insertelement <16 x i16> %resD, i16 %valE, i16 14
  %resF = insertelement <16 x i16> %resE, i16 %valF, i16 15
  ret <16 x i16> %resF
}

; Lanes 0, 1 and 3 are loaded from %ptr[4], %ptr[5] and %ptr[7]; every other
; lane is left undef. The checks expect a single vmovss
; (xmm0 = mem[0],zero,zero,zero).
define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 4
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 5
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 7
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %res0 = insertelement <32 x i8> undef, i8 %val0, i8 0
  %res1 = insertelement <32 x i8> %res0, i8 %val1, i8 1
  %res3 = insertelement <32 x i8> %res1, i8 %val3, i8 3
  ret <32 x i8> %res3
}

; Lanes 0, 1 and 3 are loaded from %ptr[2], %ptr[3] and %ptr[5]; lanes 14
; through 17 are the constant 0; every other lane is left undef. The checks
; expect a single vmovss (xmm0 = mem[0],zero,zero,zero).
define <32 x i8> @merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 2
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 3
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 5
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %res0 = insertelement <32 x i8> undef, i8 %val0, i8 0
  %res1 = insertelement <32 x i8> %res0, i8 %val1, i8 1
  %res3 = insertelement <32 x i8> %res1, i8 %val3, i8 3
  %resE = insertelement <32 x i8> %res3, i8     0, i8 14
  %resF = insertelement <32 x i8> %resE, i8     0, i8 15
  %resG = insertelement <32 x i8> %resF, i8     0, i8 16
  %resH = insertelement <32 x i8> %resG, i8     0, i8 17
  ret <32 x i8> %resH
}

;
; Consecutive loads must not be combined when any (or all) of them are volatile.
;

; Lanes 0 and 1 come from volatile loads of %ptr[3] and %ptr[4]; lane 3 is the
; constant 0.0; lane 2 is left undef. The checks expect the two loads to stay
; separate, as a vmovsd followed by a vmovhpd.
define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_34uz_volatile:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
;
; X32-AVX-LABEL: merge_4f64_f64_34uz_volatile:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
  %val0 = load volatile double, double* %ptr0
  %val1 = load volatile double, double* %ptr1
  %res0 = insertelement <4 x double> undef, double %val0, i32 0
  %res1 = insertelement <4 x double> %res0, double %val1, i32 1
  %res3 = insertelement <4 x double> %res1, double   0.0, i32 3
  ret <4 x double> %res3
}

; Same lane pattern as merge_16i16_i16_0uu3zzuuuuuzCuEF (loads into lanes 0, 3,
; 12, 14, 15; zeros in lanes 4, 5, 13), but the loads of %ptr[0] and %ptr[15]
; are volatile. The checks expect element-wise vpinsrw loads into two xmm
; halves that are then joined by a 128-bit insert, rather than one wide load.
define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind uwtable noinline ssp {
; AVX1-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm1
; AVX1-NEXT:    vpinsrw $4, 24(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vpinsrw $6, 28(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vpinsrw $7, 30(%rdi), %xmm0, %xmm0
; AVX1-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm1
; AVX2-NEXT:    vpinsrw $4, 24(%rdi), %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $6, 28(%rdi), %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $7, 30(%rdi), %xmm0, %xmm0
; AVX2-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm1
; AVX512F-NEXT:    vpinsrw $4, 24(%rdi), %xmm0, %xmm0
; AVX512F-NEXT:    vpinsrw $6, 28(%rdi), %xmm0, %xmm0
; AVX512F-NEXT:    vpinsrw $7, 30(%rdi), %xmm0, %xmm0
; AVX512F-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; X32-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT:    vpinsrw $0, (%eax), %xmm0, %xmm1
; X32-AVX-NEXT:    vpinsrw $4, 24(%eax), %xmm0, %xmm0
; X32-AVX-NEXT:    vpinsrw $6, 28(%eax), %xmm0, %xmm0
; X32-AVX-NEXT:    vpinsrw $7, 30(%eax), %xmm0, %xmm0
; X32-AVX-NEXT:    vpinsrw $3, 6(%eax), %xmm1, %xmm1
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-AVX-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 3
  %ptrC = getelementptr inbounds i16, i16* %ptr, i64 12
  %ptrE = getelementptr inbounds i16, i16* %ptr, i64 14
  %ptrF = getelementptr inbounds i16, i16* %ptr, i64 15
  %val0 = load volatile i16, i16* %ptr0
  %val3 = load i16, i16* %ptr3
  %valC = load i16, i16* %ptrC
  %valE = load i16, i16* %ptrE
  %valF = load volatile i16, i16* %ptrF
  %res0 = insertelement <16 x i16> undef, i16 %val0, i16 0
  %res3 = insertelement <16 x i16> %res0, i16 %val3, i16 3
  %res4 = insertelement <16 x i16> %res3, i16     0, i16 4
  %res5 = insertelement <16 x i16> %res4, i16     0, i16 5
  %resC = insertelement <16 x i16> %res5, i16 %valC, i16 12
  %resD = insertelement <16 x i16> %resC, i16     0, i16 13
  %resE = insertelement <16 x i16> %resD, i16 %valE, i16 14
  %resF = insertelement <16 x i16> %resE, i16 %valF, i16 15
  ret <16 x i16> %resF
}
