; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512

; Tests for SSE2 and below, without SSE3+.

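; test1: insert a scalar double into element 0 of a vector loaded from memory.
; The scalar arrives in %xmm0 on x86-64 (movsd/vblendps); on x86 it is folded
; straight off the stack as a memory operand (movlpd).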
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-SSE-LABEL: test1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movapd (%ecx), %xmm0
; X86-SSE-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; X86-SSE-NEXT:    movapd %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test1:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovapd (%ecx), %xmm0
; X86-AVX-NEXT:    vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; X86-AVX-NEXT:    vmovapd %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movapd (%rsi), %xmm1
; X64-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X64-SSE-NEXT:    movapd %xmm1, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> <i32 2, i32 1>
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}

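; test2: as test1, but inserting into element 1; this selects movhpd on x86
; and movlhps/vmovlhps on x86-64.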
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-SSE-LABEL: test2:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movapd (%ecx), %xmm0
; X86-SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-SSE-NEXT:    movapd %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test2:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovapd (%ecx), %xmm0
; X86-AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-AVX-NEXT:    vmovapd %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test2:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT:    movaps %xmm1, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test2:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm1
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <2 x double>, <2 x double>* %A, align 16
  %tmp7 = insertelement <2 x double> undef, double %B, i32 0
  %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> <i32 0, i32 2>
  store <2 x double> %tmp9, <2 x double>* %r, align 16
  ret void
}

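; test3: an extract/insert chain that interleaves the low halves of two loaded
; vectors should match a single unpcklps with a folded memory operand.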
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
; X86-SSE-LABEL: test3:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test3:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test3:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test3:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* %B  ; <<4 x float>> [#uses=2]
  %tmp3 = load <4 x float>, <4 x float>* %A  ; <<4 x float>> [#uses=2]
  %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0  ; <float> [#uses=1]
  %tmp7 = extractelement <4 x float> %tmp, i32 0  ; <float> [#uses=1]
  %tmp8 = extractelement <4 x float> %tmp3, i32 1  ; <float> [#uses=1]
  %tmp9 = extractelement <4 x float> %tmp, i32 1  ; <float> [#uses=1]
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2  ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp13, <4 x float>* %res
  ret void
}

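; test4: with lanes 1 and 3 taken from undef, the <2,6,3,7> shuffle needs only
; one source, so it folds to a single in-register shufps/vpermilps.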
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
; X86-SSE-LABEL: test4:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test4:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test4:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test4:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp5, <4 x float>* %res
  ret void
}

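; test5: a float loaded through an i8* is widened to <4 x i32> by interleaving
; bytes and words with zero: punpcklbw+punpcklwd against a zeroed register on
; SSE, vpmovzxbw on AVX.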
define <4 x i32> @test5(i8** %ptr) nounwind {
; X86-SSE-LABEL: test5:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl (%eax), %eax
; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pxor %xmm0, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test5:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl (%eax), %eax
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test5:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movq (%rdi), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pxor %xmm0, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test5:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movq (%rdi), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-AVX-NEXT:    retq
  %tmp = load i8*, i8** %ptr  ; <i8*> [#uses=1]
  %tmp.upgrd.1 = bitcast i8* %tmp to float*  ; <float*> [#uses=1]
  %tmp.upgrd.2 = load float, float* %tmp.upgrd.1  ; <float> [#uses=1]
  %tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0  ; <<4 x float>> [#uses=1]
  %tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1  ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3  ; <<4 x float>> [#uses=1]
  %tmp21 = bitcast <4 x float> %tmp11 to <16 x i8>  ; <<16 x i8>> [#uses=1]
  %tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>  ; <<16 x i8>> [#uses=1]
  %tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16>  ; <<8 x i16>> [#uses=1]
  %tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>  ; <<8 x i16>> [#uses=1]
  %tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32>  ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %tmp36
}

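; test6: only lane 0 of the shuffle is defined (the rest come from undef), so
; the shuffle should vanish, leaving a plain movaps load/store copy.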
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
; X86-SSE-LABEL: test6:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test6:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test6:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test6:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp1 = load <4 x float>, <4 x float>* %A  ; <<4 x float>> [#uses=1]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 6, i32 7>  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp2, <4 x float>* %res
  ret void
}

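; test7: splatting element 0 of an all-zero vector is still zero; expect just
; an xorps and a store to address 0.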
define void @test7() nounwind {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, 0
; AVX-NEXT:    ret{{[l|q]}}
  bitcast <4 x i32> zeroinitializer to <4 x float>  ; <<4 x float>>:1 [#uses=1]
  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer  ; <<4 x float>>:2 [#uses=1]
  store <4 x float> %2, <4 x float>* null
  ret void
}

@x = external global [4 x i32]

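; test8: four scalar loads of consecutive elements of @x rebuilt into a vector
; should merge into one unaligned movups load.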
define <2 x i64> @test8() nounwind {
; X86-SSE-LABEL: test8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups x, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups x, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test8:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups {{.*}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups {{.*}}(%rip), %xmm0
; X64-AVX-NEXT:    retq
  %tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0)  ; <i32> [#uses=1]
  %tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1)  ; <i32> [#uses=1]
  %tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2)  ; <i32> [#uses=1]
  %tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3)  ; <i32> [#uses=1]
  %tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0  ; <<4 x i32>> [#uses=1]
  %tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1  ; <<4 x i32>> [#uses=1]
  %tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2  ; <<4 x i32>> [#uses=1]
  %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3  ; <<4 x i32>> [#uses=1]
  %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>  ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp16
}

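; test9: build_vector of four float arguments. On x86 they sit contiguously on
; the stack (one movups); on x86-64 they arrive in XMM registers and are packed
; with unpcklps/movlhps (SSE) or a vinsertps chain (AVX).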
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test9:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test9:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test9:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test9:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <4 x float> undef, float %a, i32 0  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2  ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

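; test10: the same build_vector as test9 without the leading integer argument;
; codegen is identical.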
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test10:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test10:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test10:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test10:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <4 x float> undef, float %a, i32 0  ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1  ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2  ; <<4 x float>> [#uses=1]
  %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp13
}

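; test11: build_vector of two doubles; one movups from the stack on x86, a
; single movlhps/vmovlhps on x86-64.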
define <2 x double> @test11(double %a, double %b) nounwind {
; X86-SSE-LABEL: test11:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test11:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test11:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test11:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-AVX-NEXT:    retq
  %tmp = insertelement <2 x double> undef, double %a, i32 0  ; <<2 x double>> [#uses=1]
  %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1  ; <<2 x double>> [#uses=1]
  ret <2 x double> %tmp7
}

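; test12: a low-half blend with <1.0 x 4> and a high-half extract of the same
; load, added together; exercises movsd/vblendps and movhlps/vunpckhpd lowering.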
define void @test12() nounwind {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd 0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    addps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm2, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps 0, %xmm0
; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, 0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test12:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, 0
; AVX512-NEXT:    ret{{[l|q]}}
  %tmp1 = load <4 x float>, <4 x float>* null  ; <<4 x float>> [#uses=2]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>  ; <<4 x float>> [#uses=1]
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 6, i32 7>  ; <<4 x float>> [#uses=1]
  %tmp4 = fadd <4 x float> %tmp2, %tmp3  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp4, <4 x float>* null
  ret void
}

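; test13: the <1,4,1,5> shuffle of two loads lowers to a shufps with a folded
; memory operand plus a lane-reordering shufps/vpermilps.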
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; X86-SSE-LABEL: test13:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test13:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test13:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdx), %xmm0
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test13:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdx), %xmm0
; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <4 x float>, <4 x float>* %B  ; <<4 x float>> [#uses=1]
  %tmp5 = load <4 x float>, <4 x float>* %C  ; <<4 x float>> [#uses=1]
  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> <i32 1, i32 4, i32 1, i32 5>  ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp11, <4 x float>* %res
  ret void
}

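; test14: merge the low halves of an add and a sub of the same inputs with a
; single movlhps/vmovlhps.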
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-SSE-LABEL: test14:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm1
; X86-SSE-NEXT:    movaps (%eax), %xmm2
; X86-SSE-NEXT:    movaps %xmm2, %xmm0
; X86-SSE-NEXT:    addps %xmm1, %xmm0
; X86-SSE-NEXT:    subps %xmm1, %xmm2
; X86-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test14:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps (%eax), %xmm1
; X86-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X86-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test14:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movaps (%rdi), %xmm2
; X64-SSE-NEXT:    movaps %xmm2, %xmm0
; X64-SSE-NEXT:    addps %xmm1, %xmm0
; X64-SSE-NEXT:    subps %xmm1, %xmm2
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test14:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm1
; X64-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X64-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X64-AVX-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* %y  ; <<4 x float>> [#uses=2]
  %tmp5 = load <4 x float>, <4 x float>* %x  ; <<4 x float>> [#uses=2]
  %tmp9 = fadd <4 x float> %tmp5, %tmp  ; <<4 x float>> [#uses=1]
  %tmp21 = fsub <4 x float> %tmp5, %tmp  ; <<4 x float>> [#uses=1]
  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> <i32 0, i32 1, i32 4, i32 5>  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp27
}

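; test15: the <2,3,6,7> shuffle takes the high half of each source; a single
; unpckhpd with a folded load covers it.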
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-SSE-LABEL: test15:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test15:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test15:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test15:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-AVX-NEXT:    retq
entry:
  %tmp = load <4 x float>, <4 x float>* %y  ; <<4 x float>> [#uses=1]
  %tmp3 = load <4 x float>, <4 x float>* %x  ; <<4 x float>> [#uses=1]
  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> <i32 2, i32 3, i32 6, i32 7>  ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp4
}

; PR8900

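; test16: extract elements 0 and 2 of an over-aligned <4 x double> load; SSE
; folds the second element from memory (unpcklpd), AVX loads the full ymm and
; extracts the upper 128 bits first.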
define <2 x double> @test16(<4 x double>* nocapture %srcA, <2 x double>* nocapture %dst) {
; X86-SSE-LABEL: test16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps 96(%eax), %xmm0
; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps 96(%eax), %ymm0
; X86-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test16:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps 96(%rdi), %xmm0
; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps 96(%rdi), %ymm0
; X64-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
  %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
  %i6 = load <4 x double>, <4 x double>* %i5, align 32
  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %i7
}

; PR9009
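; test17: storing a constant vector with undef lanes; SSE/AVX1 keep the undef
; elements in the constant-pool value, AVX512 turns it into a broadcast.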
define fastcc void @test17() nounwind {
; X86-SSE-LABEL: test17:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: test17:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: test17:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-SSE-LABEL: test17:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X64-SSE-NEXT:    movaps %xmm0, (%rax)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: test17:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rax)
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: test17:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rax)
; X64-AVX512-NEXT:    retq
entry:
  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %2 = bitcast <4 x i32> %1 to <4 x float>
  store <4 x float> %2, <4 x float>* undef
  ret void
}

; PR9210
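; fptrunc <4 x double> to <4 x float>: SSE needs two cvtpd2ps plus an unpcklpd
; to recombine the halves, AVX does it with one 256-bit vcvtpd2ps.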
define <4 x float> @f(<4 x double>) nounwind {
; SSE-LABEL: f:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    cvtpd2ps %xmm1, %xmm1
; SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vcvtpd2ps %ymm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    ret{{[l|q]}}
entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
}

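; Shuffling a zero into the upper i64 lane should be recognized as movq's
; implicit zero-extension.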
define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
; SSE-LABEL: test_insert_64_zext:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_insert_64_zext:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %1
}

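; Masking off the low 32 bits of the i128 bitcast stays in the vector domain on
; x86 (andps with a constant); x86-64 currently round-trips through a GPR.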
define <4 x i32> @PR19721(<4 x i32> %i) {
; X86-SSE-LABEL: PR19721:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: PR19721:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: PR19721:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movq %xmm0, %rax
; X64-SSE-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-SSE-NEXT:    andq %rax, %rcx
; X64-SSE-NEXT:    movq %rcx, %xmm1
; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: PR19721:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovq %xmm0, %rax
; X64-AVX1-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-AVX1-NEXT:    andq %rax, %rcx
; X64-AVX1-NEXT:    vmovq %rcx, %xmm1
; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: PR19721:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovq %xmm0, %rax
; X64-AVX512-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-AVX512-NEXT:    andq %rax, %rcx
; X64-AVX512-NEXT:    vmovq %rcx, %xmm1
; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX512-NEXT:    retq
  %bc = bitcast <4 x i32> %i to i128
  %insert = and i128 %bc, -4294967296
  %bc2 = bitcast i128 %insert to <4 x i32>
  ret <4 x i32> %bc2
}

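; SSE2 has no 32-bit element multiply (pmulld is SSE4.1), so the <4 x i32> mul
; is legalized as two pmuludq on the even and odd lanes plus shuffles to
; recombine the results; AVX can use vpmulld directly.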
define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: test_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %m = mul <4 x i32> %x, %y
  ret <4 x i32> %m
}