; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512F
;
; Just one 32-bit run to make sure we do reasonable things.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE

; Loads the doubles at indices 2 and 3 of %ptr and inserts them into lanes 0
; and 1 of a <2 x double>. The expected output folds both scalar loads into a
; single unaligned 16-byte load at byte offset 16.
define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2f64_f64_23:
; SSE:       # BB#0:
; SSE-NEXT:    movups 16(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_2f64_f64_23:
; AVX:       # BB#0:
; AVX-NEXT:    vmovups 16(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_2f64_f64_23:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movups 16(%eax), %xmm0
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %res0 = insertelement <2 x double> undef, double %val0, i32 0
  %res1 = insertelement <2 x double> %res0, double %val1, i32 1
  ret <2 x double> %res1
}

; Loads the i64 elements at indices 1 and 2 of %ptr and inserts them into
; lanes 0 and 1 of a <2 x i64>. The expected output is a single unaligned
; 16-byte load at byte offset 8.
define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2i64_i64_12:
; SSE:       # BB#0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_2i64_i64_12:
; AVX:       # BB#0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_2i64_i64_12:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movups 8(%eax), %xmm0
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
  %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
  %val0 = load i64, i64* %ptr0
  %val1 = load i64, i64* %ptr1
  %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
  %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
  ret <2 x i64> %res1
}

; Loads the floats at indices 2, 3, 4 and 5 of %ptr and inserts them into
; lanes 0-3 of a <4 x float>. The expected output is a single unaligned
; 16-byte load at byte offset 8.
define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_2345:
; SSE:       # BB#0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_2345:
; AVX:       # BB#0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4f32_f32_2345:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movups 8(%eax), %xmm0
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
  %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val2 = load float, float* %ptr2
  %val3 = load float, float* %ptr3
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float %val3, i32 3
  ret <4 x float> %res3
}

; Puts the float at index 3 of %ptr in lane 0 and the constant 0.0 in lane 1;
; lanes 2 and 3 are left undef. The expected output is one scalar 32-bit load
; whose implicit zeroing of the upper lanes supplies the zero element.
define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_3zuu:
; SSE:       # BB#0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_3zuu:
; AVX:       # BB#0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4f32_f32_3zuu:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
  %val0 = load float, float* %ptr0
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  ret <4 x float> %res1
}

; Puts the floats at indices 3 and 4 of %ptr in lanes 0 and 1; lanes 2 and 3
; are left undef. The expected output is a single 64-bit load (movsd/vmovsd)
; covering both elements.
define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_34uu:
; SSE:       # BB#0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_34uu:
; AVX:       # BB#0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4f32_f32_34uu:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  ret <4 x float> %res1
}

; Inserts the floats at indices 3, 4 and 6 of %ptr into lanes 0, 1 and 3 of a
; zeroinitializer vector, so lane 2 stays zero. The expected output reads
; 16 bytes at byte offset 12 and forces lane 2 to zero: a shufps pair with
; SSE2 only, blendps with SSE4.1, and a vblendps with the load folded into
; its memory operand in the AVX runs.
define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_34z6:
; SSE2:       # BB#0:
; SSE2-NEXT:    movups 12(%rdi), %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_34z6:
; SSE41:       # BB#0:
; SSE41-NEXT:    movups 12(%rdi), %xmm1
; SSE41-NEXT:    xorps %xmm0, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_34z6:
; AVX:       # BB#0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4f32_f32_34z6:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movups 12(%eax), %xmm1
; X32-SSE-NEXT:    xorps %xmm0, %xmm0
; X32-SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 4
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 6
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val3 = load float, float* %ptr3
  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res3 = insertelement <4 x float> %res1, float %val3, i32 3
  ret <4 x float> %res3
}

; Inserts the floats at indices 4 and 5 of %ptr into lanes 0 and 1 of a
; zeroinitializer vector, so lanes 2 and 3 stay zero. The expected output is
; a single 64-bit load (movsd/vmovsd) that zeroes the upper half.
define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_45zz:
; SSE:       # BB#0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_45zz:
; AVX:       # BB#0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4f32_f32_45zz:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %res0 = insertelement <4 x float> zeroinitializer, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  ret <4 x float> %res1
}

; Inserts the floats at indices 0, 1 and 2 of %ptr into lanes 0-2 and an
; explicit undef into lane 3. The recorded output does not use one 16-byte
; load: with SSE2 only it is three scalar loads joined by unpcklps, and in the
; other runs a 64-bit load followed by an insertps from memory.
define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_012u:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_012u:
; SSE41:       # BB#0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_012u:
; AVX:       # BB#0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4f32_f32_012u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
  %ptr2 = getelementptr inbounds float, float* %ptr, i64 2
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val2 = load float, float* %ptr2
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float undef, i32 3
  ret <4 x float> %res3
}

; Inserts the floats at indices 0, 1 and 9 of %ptr into lanes 0-2 and an
; explicit undef into lane 3. Only the first two elements are adjacent in
; memory. The recorded output is three scalar loads joined by unpcklps with
; SSE2 only, and a 64-bit load followed by an insertps from memory in the
; other runs.
define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_019u:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_019u:
; SSE41:       # BB#0:
; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_019u:
; AVX:       # BB#0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4f32_f32_019u:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 1
  %ptr2 = getelementptr inbounds float, float* %ptr, i64 9
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val2 = load float, float* %ptr2
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float undef, i32 3
  ret <4 x float> %res3
}

; Inserts the i32 elements at indices 2, 3 and 5 of %ptr into lanes 0, 1 and
; 3; lane 2 is left undef. The expected output is a single unaligned 16-byte
; load at byte offset 8.
define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_23u5:
; SSE:       # BB#0:
; SSE-NEXT:    movups 8(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_23u5:
; AVX:       # BB#0:
; AVX-NEXT:    vmovups 8(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4i32_i32_23u5:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movups 8(%eax), %xmm0
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %val3 = load i32, i32* %ptr3
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
  ret <4 x i32> %res3
}

; Puts the i32 at index 3 of %ptr in lane 0 and the constant 0 in lane 1;
; lanes 2 and 3 are left undef. The expected output is one 32-bit movd/vmovd
; load whose zeroed upper lanes supply the zero element.
define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_3zuu:
; SSE:       # BB#0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_3zuu:
; AVX:       # BB#0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4i32_i32_3zuu:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
  %val0 = load i32, i32* %ptr0
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32     0, i32 1
  ret <4 x i32> %res1
}

; Puts the i32 elements at indices 3 and 4 of %ptr in lanes 0 and 1; lanes 2
; and 3 are left undef. The expected output is a single 64-bit movq/vmovq
; load covering both elements.
define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_34uu:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_34uu:
; AVX:       # BB#0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4i32_i32_34uu:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 4
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; Inserts the i32 elements at indices 4 and 5 of %ptr into lanes 0 and 1 of a
; zeroinitializer vector, so lanes 2 and 3 stay zero. The expected output is
; a single 64-bit movq/vmovq load that zeroes the upper half.
define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_45zz:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4i32_i32_45zz:
; AVX:       # BB#0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4i32_i32_45zz:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
  %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
  ret <4 x i32> %res1
}

; Inserts the i16 elements at indices 2, 3, 5, 6, 7 and 9 of %ptr into lanes
; 0, 1, 3, 4, 5 and 7; lanes 2 and 6 are left undef. The expected output is a
; single unaligned 16-byte load at byte offset 4.
define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_23u567u9:
; SSE:       # BB#0:
; SSE-NEXT:    movups 4(%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_8i16_i16_23u567u9:
; AVX:       # BB#0:
; AVX-NEXT:    vmovups 4(%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_8i16_i16_23u567u9:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movups 4(%eax), %xmm0
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 5
  %ptr4 = getelementptr inbounds i16, i16* %ptr, i64 6
  %ptr5 = getelementptr inbounds i16, i16* %ptr, i64 7
  %ptr7 = getelementptr inbounds i16, i16* %ptr, i64 9
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %val3 = load i16, i16* %ptr3
  %val4 = load i16, i16* %ptr4
  %val5 = load i16, i16* %ptr5
  %val7 = load i16, i16* %ptr7
  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
  %res4 = insertelement <8 x i16> %res3, i16 %val4, i32 4
  %res5 = insertelement <8 x i16> %res4, i16 %val5, i32 5
  %res7 = insertelement <8 x i16> %res5, i16 %val7, i32 7
  ret <8 x i16> %res7
}

; Puts the i16 elements at indices 3 and 4 of %ptr in lanes 0 and 1; the
; remaining six lanes are left undef. The expected output is a single 32-bit
; movd/vmovd load covering both elements.
define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_34uuuuuu:
; SSE:       # BB#0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_8i16_i16_34uuuuuu:
; AVX:       # BB#0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_8i16_i16_34uuuuuu:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  ret <8 x i16> %res1
}

; Inserts the i16 elements at indices 4, 5 and 7 of %ptr into lanes 0, 1 and
; 3, leaves lane 2 undef, and writes constant zeros into lanes 4-7. The
; expected output is a single 64-bit movq/vmovq load that zeroes the upper
; half.
define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_45u7zzzz:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_8i16_i16_45u7zzzz:
; AVX:       # BB#0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_8i16_i16_45u7zzzz:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %val3 = load i16, i16* %ptr3
  %res0 = insertelement <8 x i16> undef, i16 %val0, i32 0
  %res1 = insertelement <8 x i16> %res0, i16 %val1, i32 1
  %res3 = insertelement <8 x i16> %res1, i16 %val3, i32 3
  %res4 = insertelement <8 x i16> %res3, i16     0, i32 4
  %res5 = insertelement <8 x i16> %res4, i16     0, i32 5
  %res6 = insertelement <8 x i16> %res5, i16     0, i32 6
  %res7 = insertelement <8 x i16> %res6, i16     0, i32 7
  ret <8 x i16> %res7
}

; Inserts the bytes at offsets 0, 1, 3-13 and 15 of %ptr into the lanes with
; the same numbers; lanes 2 and 14 are left undef. The expected output is a
; single unaligned 16-byte load from the base address.
define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; SSE:       # BB#0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; AVX:       # BB#0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movups (%eax), %xmm0
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
  %ptr4 = getelementptr inbounds i8, i8* %ptr, i64 4
  %ptr5 = getelementptr inbounds i8, i8* %ptr, i64 5
  %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
  %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
  %ptr8 = getelementptr inbounds i8, i8* %ptr, i64 8
  %ptr9 = getelementptr inbounds i8, i8* %ptr, i64 9
  %ptrA = getelementptr inbounds i8, i8* %ptr, i64 10
  %ptrB = getelementptr inbounds i8, i8* %ptr, i64 11
  %ptrC = getelementptr inbounds i8, i8* %ptr, i64 12
  %ptrD = getelementptr inbounds i8, i8* %ptr, i64 13
  %ptrF = getelementptr inbounds i8, i8* %ptr, i64 15
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %val4 = load i8, i8* %ptr4
  %val5 = load i8, i8* %ptr5
  %val6 = load i8, i8* %ptr6
  %val7 = load i8, i8* %ptr7
  %val8 = load i8, i8* %ptr8
  %val9 = load i8, i8* %ptr9
  %valA = load i8, i8* %ptrA
  %valB = load i8, i8* %ptrB
  %valC = load i8, i8* %ptrC
  %valD = load i8, i8* %ptrD
  %valF = load i8, i8* %ptrF
  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
  %res4 = insertelement <16 x i8> %res3, i8 %val4, i32 4
  %res5 = insertelement <16 x i8> %res4, i8 %val5, i32 5
  %res6 = insertelement <16 x i8> %res5, i8 %val6, i32 6
  %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
  %res8 = insertelement <16 x i8> %res7, i8 %val8, i32 8
  %res9 = insertelement <16 x i8> %res8, i8 %val9, i32 9
  %resA = insertelement <16 x i8> %res9, i8 %valA, i32 10
  %resB = insertelement <16 x i8> %resA, i8 %valB, i32 11
  %resC = insertelement <16 x i8> %resB, i8 %valC, i32 12
  %resD = insertelement <16 x i8> %resC, i8 %valD, i32 13
  %resF = insertelement <16 x i8> %resD, i8 %valF, i32 15
  ret <16 x i8> %resF
}

; Inserts the bytes at offsets 0, 1 and 3 of %ptr into the same-numbered
; lanes and constant zeros into lanes 6, 7, 13, 14 and 15; every other lane
; is left undef. The expected output is a single 32-bit movd/vmovd load that
; zeroes the upper lanes.
define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; SSE:       # BB#0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; AVX:       # BB#0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
  %res3 = insertelement <16 x i8> %res1, i8 %val3, i32 3
  %res6 = insertelement <16 x i8> %res3, i8     0, i32 6
  %res7 = insertelement <16 x i8> %res6, i8     0, i32 7
  %resD = insertelement <16 x i8> %res7, i8     0, i32 13
  %resE = insertelement <16 x i8> %resD, i8     0, i32 14
  %resF = insertelement <16 x i8> %resE, i8     0, i32 15
  ret <16 x i8> %resF
}

; Inserts the bytes at offsets 0-3, 6 and 7 of %ptr into the same-numbered
; lanes and constant zeros into lanes 13, 14 and 15; every other lane is left
; undef. The expected output is a single 64-bit movq/vmovq load that zeroes
; the upper half.
define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
; AVX:       # BB#0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
  %ptr2 = getelementptr inbounds i8, i8* %ptr, i64 2
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 3
  %ptr6 = getelementptr inbounds i8, i8* %ptr, i64 6
  %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 7
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val2 = load i8, i8* %ptr2
  %val3 = load i8, i8* %ptr3
  %val6 = load i8, i8* %ptr6
  %val7 = load i8, i8* %ptr7
  %res0 = insertelement <16 x i8> undef, i8 %val0, i32 0
  %res1 = insertelement <16 x i8> %res0, i8 %val1, i32 1
  %res2 = insertelement <16 x i8> %res1, i8 %val2, i32 2
  %res3 = insertelement <16 x i8> %res2, i8 %val3, i32 3
  %res6 = insertelement <16 x i8> %res3, i8 %val6, i32 6
  %res7 = insertelement <16 x i8> %res6, i8 %val7, i32 7
  %resD = insertelement <16 x i8> %res7, i8     0, i32 13
  %resE = insertelement <16 x i8> %resD, i8     0, i32 14
  %resF = insertelement <16 x i8> %resE, i8     0, i32 15
  ret <16 x i8> %resF
}

; Loads one i32 from %src, splats it across a <4 x i32>, shifts it right by
; <0, undef, undef, undef>, masks it with <-1, 0, 0, 0> and stores the result
; to %dst. The expected output is a single 32-bit movd/vmovd load that zeroes
; the upper lanes, followed by an aligned 16-byte store (vmovdqa in the
; AVX512F run, movaps/vmovaps elsewhere).
define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) {
; SSE-LABEL: merge_4i32_i32_combine:
; SSE:       # BB#0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: merge_4i32_i32_combine:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: merge_4i32_i32_combine:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: merge_4i32_i32_combine:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512F-NEXT:    retq
;
; X32-SSE-LABEL: merge_4i32_i32_combine:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT:    movaps %xmm0, (%eax)
; X32-SSE-NEXT:    retl
  %1 = getelementptr i32, i32* %src, i32 0
  %2 = load i32, i32* %1
  %3 = insertelement <4 x i32> undef, i32 %2, i32 0
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
  %5 = lshr <4 x i32> %4, <i32 0, i32 undef, i32 undef, i32 undef>
  %6 = and <4 x i32> %5, <i32 -1, i32 0, i32 0, i32 0>
  store <4 x i32> %6, <4 x i32>* %dst
  ret void
}

;
; Consecutive loads that include any volatile load must not be combined.
;

; Same access pattern as merge_2i64_i64_12 (i64 elements 1 and 2), but both
; loads are volatile. The expected output keeps the loads separate: two
; 64-bit loads joined by punpcklqdq in the 64-bit runs, and a movd plus three
; pinsrd from memory in the 32-bit run.
define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2i64_i64_12_volatile:
; SSE:       # BB#0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_2i64_i64_12_volatile:
; AVX:       # BB#0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_2i64_i64_12_volatile:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT:    pinsrd $1, 12(%eax), %xmm0
; X32-SSE-NEXT:    pinsrd $2, 16(%eax), %xmm0
; X32-SSE-NEXT:    pinsrd $3, 20(%eax), %xmm0
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
  %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 2
  %val0 = load volatile i64, i64* %ptr0
  %val1 = load volatile i64, i64* %ptr1
  %res0 = insertelement <2 x i64> undef, i64 %val0, i32 0
  %res1 = insertelement <2 x i64> %res0, i64 %val1, i32 1
  ret <2 x i64> %res1
}

; Same access pattern as merge_4f32_f32_2345 (floats 2-5), but the first load
; is volatile while the other three are ordinary loads. The expected output
; performs four separate scalar loads in every run and never one 16-byte
; load: movss plus unpcklps with SSE2 only, movss plus three insertps from
; memory elsewhere.
define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_2345_volatile:
; SSE2:       # BB#0:
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: merge_4f32_f32_2345_volatile:
; SSE41:       # BB#0:
; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_2345_volatile:
; AVX:       # BB#0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4f32_f32_2345_volatile:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-SSE-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 2
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 3
  %ptr2 = getelementptr inbounds float, float* %ptr, i64 4
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 5
  %val0 = load volatile float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val2 = load float, float* %ptr2
  %val3 = load float, float* %ptr3
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float %val1, i32 1
  %res2 = insertelement <4 x float> %res1, float %val2, i32 2
  %res3 = insertelement <4 x float> %res2, float %val3, i32 3
  ret <4 x float> %res3
}

;
; Non-consecutive test.
;

; Builds a <4 x float> from two independent pointers: the float at %ptr0 goes
; in lane 0, the constant 0.0 in lane 1, and the float at %ptr1 in both lanes
; 2 and 3. The expected output is two scalar loads, a duplication of the
; second value within its register, and an unpcklpd that joins the halves.
define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_X0YY:
; SSE:       # BB#0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: merge_4f32_f32_X0YY:
; AVX:       # BB#0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
;
; X32-SSE-LABEL: merge_4f32_f32_X0YY:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; X32-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT:    retl
  %val0 = load float, float* %ptr0, align 4
  %val1 = load float, float* %ptr1, align 4
  %res0 = insertelement <4 x float> undef, float %val0, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.000000e+00, i32 1
  %res2 = insertelement <4 x float> %res1, float %val1, i32 2
  %res3 = insertelement <4 x float> %res2, float %val1, i32 3
  ret <4 x float> %res3
}
