; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c
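; For illustration only (not part of the autogenerated checks): each test below is
; intended to mirror the IR clang emits for the matching AVX intrinsic. For example,
; a C routine such as (hypothetical helper name)
;   __m256d add_pd(__m256d a, __m256d b) { return _mm256_add_pd(a, b); }
; lowers to the plain 'fadd <4 x double>' checked in test_mm256_add_pd, whereas
; intrinsics with no generic IR equivalent (e.g. _mm256_addsub_pd) lower to calls
; to the corresponding llvm.x86.avx.* intrinsics declared throughout this file.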

define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_add_pd:
; X32:       # BB#0:
; X32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_add_pd:
; X64:       # BB#0:
; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = fadd <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_add_ps:
; X32:       # BB#0:
; X32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_add_ps:
; X64:       # BB#0:
; X64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = fadd <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_addsub_pd:
; X32:       # BB#0:
; X32-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_addsub_pd:
; X64:       # BB#0:
; X64-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_addsub_ps:
; X32:       # BB#0:
; X32-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_addsub_ps:
; X64:       # BB#0:
; X64-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_and_pd:
; X32:       # BB#0:
; X32-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_and_pd:
; X64:       # BB#0:
; X64-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = and <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_and_ps:
; X32:       # BB#0:
; X32-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_and_ps:
; X64:       # BB#0:
; X64-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = and <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_andnot_pd:
; X32:       # BB#0:
; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X32-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; X32-NEXT:    vxorps %ymm2, %ymm0, %ymm0
; X32-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_andnot_pd:
; X64:       # BB#0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; X64-NEXT:    vxorps %ymm2, %ymm0, %ymm0
; X64-NEXT:    vandps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %3, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_andnot_ps:
; X32:       # BB#0:
; X32-NEXT:    vandnps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_andnot_ps:
; X64:       # BB#0:
; X64-NEXT:    vandnps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <8 x i32> %3, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_blend_pd:
; X32:       # BB#0:
; X32-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_blend_pd:
; X64:       # BB#0:
; X64-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_blend_ps:
; X32:       # BB#0:
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_blend_ps:
; X64:       # BB#0:
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
; X32-LABEL: test_mm256_blendv_pd:
; X32:       # BB#0:
; X32-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_blendv_pd:
; X64:       # BB#0:
; X64-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
; X32-LABEL: test_mm256_blendv_ps:
; X32:       # BB#0:
; X32-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_blendv_ps:
; X64:       # BB#0:
; X64-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind {
; X32-LABEL: test_mm256_broadcast_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_pd:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %arg0 = bitcast <2 x double>* %a0 to i8*
  %res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %arg0)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly

define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind {
; X32-LABEL: test_mm256_broadcast_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ps:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float>* %a0 to i8*
  %res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %arg0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly

define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind {
; X32-LABEL: test_mm256_broadcast_sd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_sd:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load double, double* %a0
  %ins0 = insertelement <4 x double> undef, double %ld, i32 0
  %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
  %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
  %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
  ret <4 x double> %ins3
}

define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind {
; X32-LABEL: test_mm_broadcast_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcast_ss:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
  %ld = load float, float* %a0
  %ins0 = insertelement <4 x float> undef, float %ld, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
  ret <4 x float> %ins3
}

define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind {
; X32-LABEL: test_mm256_broadcast_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ss:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load float, float* %a0
  %ins0 = insertelement <8 x float> undef, float %ld, i32 0
  %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
  %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
  %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
  %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
  %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
  ret <8 x float> %ins7
}

define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_castpd_ps:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castpd_ps:
; X64:       # BB#0:
; X64-NEXT:    retq
  %res = bitcast <4 x double> %a0 to <8 x float>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_castpd_si256:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castpd_si256:
; X64:       # BB#0:
; X64-NEXT:    retq
  %res = bitcast <4 x double> %a0 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm256_castpd128_pd256:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castpd128_pd256:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x double> %res
}

define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_castpd256_pd128:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castpd256_pd128:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_castps_pd:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castps_pd:
; X64:       # BB#0:
; X64-NEXT:    retq
  %res = bitcast <8 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_castps_si256:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castps_si256:
; X64:       # BB#0:
; X64-NEXT:    retq
  %res = bitcast <8 x float> %a0 to <4 x i64>
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm256_castps128_ps256:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castps128_ps256:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %res
}

define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_castps256_ps128:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castps256_ps128:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_castsi128_si256:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castsi128_si256:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-NEXT:    retq
  %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_castsi256_pd:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castsi256_pd:
; X64:       # BB#0:
; X64-NEXT:    retq
  %res = bitcast <4 x i64> %a0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_castsi256_ps:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castsi256_ps:
; X64:       # BB#0:
; X64-NEXT:    retq
  %res = bitcast <4 x i64> %a0 to <8 x float>
  ret <8 x float> %res
}

define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_castsi256_si128:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_castsi256_si128:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %res
}

define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_ceil_pd:
; X32:       # BB#0:
; X32-NEXT:    vroundpd $2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_ceil_pd:
; X64:       # BB#0:
; X64-NEXT:    vroundpd $2, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_ceil_ps:
; X32:       # BB#0:
; X32-NEXT:    vroundps $2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_ceil_ps:
; X64:       # BB#0:
; X64-NEXT:    vroundps $2, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmp_pd:
; X32:       # BB#0:
; X32-NEXT:    vcmpgepd %xmm1, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmp_pd:
; X64:       # BB#0:
; X64-NEXT:    vcmpgepd %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_cmp_pd:
; X32:       # BB#0:
; X32-NEXT:    vcmpgepd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cmp_pd:
; X64:       # BB#0:
; X64-NEXT:    vcmpgepd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmp_ps:
; X32:       # BB#0:
; X32-NEXT:    vcmpgeps %xmm1, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmp_ps:
; X64:       # BB#0:
; X64-NEXT:    vcmpgeps %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_cmp_ps:
; X32:       # BB#0:
; X32-NEXT:    vcmpgeps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cmp_ps:
; X64:       # BB#0:
; X64-NEXT:    vcmpgeps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmp_sd:
; X32:       # BB#0:
; X32-NEXT:    vcmpgesd %xmm1, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmp_sd:
; X64:       # BB#0:
; X64-NEXT:    vcmpgesd %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmp_ss:
; X32:       # BB#0:
; X32-NEXT:    vcmpgess %xmm1, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmp_ss:
; X64:       # BB#0:
; X64-NEXT:    vcmpgess %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_cvtepi32_pd:
; X32:       # BB#0:
; X32-NEXT:    vcvtdq2pd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cvtepi32_pd:
; X64:       # BB#0:
; X64-NEXT:    vcvtdq2pd %xmm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res = sitofp <4 x i32> %arg0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_cvtepi32_ps:
; X32:       # BB#0:
; X32-NEXT:    vcvtdq2ps %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cvtepi32_ps:
; X64:       # BB#0:
; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %arg0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone

define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_cvtpd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vcvtpd2dqy %ymm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cvtpd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vcvtpd2dqy %ymm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_cvtpd_ps:
; X32:       # BB#0:
; X32-NEXT:    vcvtpd2psy %ymm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cvtpd_ps:
; X64:       # BB#0:
; X64-NEXT:    vcvtpd2psy %ymm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvtps_epi32:
; X32:       # BB#0:
; X32-NEXT:    vcvtps2dq %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cvtps_epi32:
; X64:       # BB#0:
; X64-NEXT:    vcvtps2dq %ymm0, %ymm0
; X64-NEXT:    retq
  %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvtps_pd:
; X32:       # BB#0:
; X32-NEXT:    vcvtps2pd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cvtps_pd:
; X64:       # BB#0:
; X64-NEXT:    vcvtps2pd %xmm0, %ymm0
; X64-NEXT:    retq
  %res = fpext <4 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_cvttpd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vcvttpd2dqy %ymm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cvttpd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vcvttpd2dqy %ymm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %cvt = fptosi <4 x double> %a0 to <4 x i32>
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvttps_epi32:
; X32:       # BB#0:
; X32-NEXT:    vcvttps2dq %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_cvttps_epi32:
; X64:       # BB#0:
; X64-NEXT:    vcvttps2dq %ymm0, %ymm0
; X64-NEXT:    retq
  %cvt = fptosi <8 x float> %a0 to <8 x i32>
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_div_pd:
; X32:       # BB#0:
; X32-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_div_pd:
; X64:       # BB#0:
; X64-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = fdiv <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_div_ps:
; X32:       # BB#0:
; X32-NEXT:    vdivps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_div_ps:
; X64:       # BB#0:
; X64-NEXT:    vdivps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = fdiv <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_dp_ps:
; X32:       # BB#0:
; X32-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_dp_ps:
; X64:       # BB#0:
; X64-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extract_epi8:
; X32:       # BB#0:
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vpextrb $15, %xmm0, %eax
; X32-NEXT:    movzbl %al, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_extract_epi8:
; X64:       # BB#0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrb $15, %xmm0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %ext = extractelement <32 x i8> %arg0, i32 31
  %res = zext i8 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extract_epi16:
; X32:       # BB#0:
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vpextrw $3, %xmm0, %eax
; X32-NEXT:    movzwl %ax, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_extract_epi16:
; X64:       # BB#0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrw $3, %xmm0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %ext = extractelement <16 x i16> %arg0, i32 11
  %res = zext i16 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extract_epi32:
; X32:       # BB#0:
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vpextrd $1, %xmm0, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_extract_epi32:
; X64:       # BB#0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrd $1, %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = extractelement <8 x i32> %arg0, i32 5
  ret i32 %res
}

define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extract_epi64:
; X32:       # BB#0:
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vpextrd $2, %xmm0, %eax
; X32-NEXT:    vpextrd $3, %xmm0, %edx
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_extract_epi64:
; X64:       # BB#0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = extractelement <4 x i64> %a0, i32 3
  ret i64 %res
}

define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_extractf128_pd:
; X32:       # BB#0:
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_extractf128_pd:
; X64:       # BB#0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x double> %res
}

define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_extractf128_ps:
; X32:       # BB#0:
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_extractf128_ps:
; X64:       # BB#0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x float> %res
}

define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extractf128_si256:
; X32:       # BB#0:
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_extractf128_si256:
; X64:       # BB#0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_floor_pd:
; X32:       # BB#0:
; X32-NEXT:    vroundpd $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_floor_pd:
; X64:       # BB#0:
; X64-NEXT:    vroundpd $1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
  ret <4 x double> %res
}

define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_floor_ps:
; X32:       # BB#0:
; X32-NEXT:    vroundps $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_floor_ps:
; X64:       # BB#0:
; X64-NEXT:    vroundps $1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
  ret <8 x float> %res
}

define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_hadd_pd:
; X32:       # BB#0:
; X32-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_hadd_pd:
; X64:       # BB#0:
; X64-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_hadd_ps:
; X32:       # BB#0:
; X32-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_hadd_ps:
; X64:       # BB#0:
; X64-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_hsub_pd:
; X32:       # BB#0:
; X32-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_hsub_pd:
; X64:       # BB#0:
; X64-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_hsub_ps:
; X32:       # BB#0:
; X32-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_hsub_ps:
; X64:       # BB#0:
; X64-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi8:
; X32:       # BB#0:
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi8:
; X64:       # BB#0:
; X64-NEXT:    movzbl %dil, %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi16:
; X32:       # BB#0:
; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X32-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi16:
; X64:       # BB#0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrw $6, %edi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi64:
; X32:       # BB#0:
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm2
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi64:
; X64:       # BB#0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = insertelement <4 x i64> %a0, i64 %a1, i32 3
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm256_insertf128_pd:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; X32-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_insertf128_pd:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; X64-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; X64-NEXT:    retq
  %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm256_insertf128_ps:
; X32:       # BB#0:
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_insertf128_ps:
; X64:       # BB#0:
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_insertf128_si256:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; X32-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_insertf128_si256:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; X64-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; X64-NEXT:    retq
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind {
; X32-LABEL: test_mm256_lddqu_si256:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vlddqu (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_lddqu_si256:
; X64:       # BB#0:
; X64-NEXT:    vlddqu (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64>* %a0 to i8*
  %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readnone

define <4 x double> @test_mm256_load_pd(double* %a0) nounwind {
; X32-LABEL: test_mm256_load_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_load_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  %res = load <4 x double>, <4 x double>* %arg0, align 32
  ret <4 x double> %res
}

define <8 x float> @test_mm256_load_ps(float* %a0) nounwind {
; X32-LABEL: test_mm256_load_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_load_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  %res = load <8 x float>, <8 x float>* %arg0, align 32
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind {
; X32-LABEL: test_mm256_load_si256:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_load_si256:
; X64:       # BB#0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, <4 x i64>* %a0, align 32
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind {
; X32-LABEL: test_mm256_loadu_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovups (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  %res = load <4 x double>, <4 x double>* %arg0, align 1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind {
; X32-LABEL: test_mm256_loadu_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovups (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  %res = load <8 x float>, <8 x float>* %arg0, align 1
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind {
; X32-LABEL: test_mm256_loadu_si256:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovups (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_si256:
; X64:       # BB#0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, <4 x i64>* %a0, align 1
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
; X32-LABEL: test_mm256_loadu2_m128:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovups (%eax), %xmm0
; X32-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128:
; X64:       # BB#0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %hi4 = load <4 x float>, <4 x float>* %arg0, align 1
  %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %arg1 = bitcast float* %a1 to <4 x float>*
  %lo4 = load <4 x float>, <4 x float>* %arg1, align 1
  %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind {
; X32-LABEL: test_mm256_loadu2_m128d:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovups (%eax), %xmm0
; X32-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128d:
; X64:       # BB#0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <2 x double>*
  %hi2 = load <2 x double>, <2 x double>* %arg0, align 1
  %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %arg1 = bitcast double* %a1 to <2 x double>*
  %lo2 = load <2 x double>, <2 x double>* %arg1, align 1
  %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
; X32-LABEL: test_mm256_loadu2_m128i:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovups (%eax), %xmm0
; X32-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128i:
; X64:       # BB#0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to <2 x i64>*
  %hi2 = load <2 x i64>, <2 x i64>* %arg0, align 1
  %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %arg1 = bitcast i64* %a1 to <2 x i64>*
  %lo2 = load <2 x i64>, <2 x i64>* %arg1, align 1
  %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_maskload_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmaskmovpd (%eax), %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_pd:
; X64:       # BB#0:
; X64-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readnone

define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_maskload_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmaskmovpd (%eax), %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_pd:
; X64:       # BB#0:
; X64-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readnone

define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_maskload_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmaskmovps (%eax), %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_ps:
; X64:       # BB#0:
; X64-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readnone

define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_maskload_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmaskmovps (%eax), %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_ps:
; X64:       # BB#0:
; X64-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
; X32-LABEL: test_mm_maskstore_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmaskmovpd %xmm1, %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_pd:
; X64:       # BB#0:
; X64-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind readnone

define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
; X32-LABEL: test_mm256_maskstore_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_pd:
; X64:       # BB#0:
; X64-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind readnone

define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
; X32-LABEL: test_mm_maskstore_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmaskmovps %xmm1, %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_ps:
; X64:       # BB#0:
; X64-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind readnone

define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
; X32-LABEL: test_mm256_maskstore_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_ps:
; X64:       # BB#0:
; X64-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_max_pd:
; X32:       # BB#0:
; X32-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_max_pd:
; X64:       # BB#0:
; X64-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_max_ps:
; X32:       # BB#0:
; X32-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_max_ps:
; X64:       # BB#0:
; X64-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_min_pd:
; X32:       # BB#0:
; X32-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_min_pd:
; X64:       # BB#0:
; X64-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_min_ps:
; X32:       # BB#0:
; X32-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_min_ps:
; X64:       # BB#0:
; X64-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_movedup_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movedup_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_movemask_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovmskpd %ymm0, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movemask_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovmskpd %ymm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone

define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_movemask_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovmskps %ymm0, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movemask_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovmskps %ymm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_mul_pd:
; X32:       # BB#0:
; X32-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mul_pd:
; X64:       # BB#0:
; X64-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = fmul <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_mul_ps:
; X32:       # BB#0:
; X32-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mul_ps:
; X64:       # BB#0:
; X64-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = fmul <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_or_pd:
; X32:       # BB#0:
; X32-NEXT:    vorps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_or_pd:
; X64:       # BB#0:
; X64-NEXT:    vorps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = or <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_or_ps:
; X32:       # BB#0:
; X32-NEXT:    vorps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_or_ps:
; X64:       # BB#0:
; X64-NEXT:    vorps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = or <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_permute_pd:
; X32:       # BB#0:
; X32-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_permute_pd:
; X64:       # BB#0:
; X64-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_permute_pd:
; X32:       # BB#0:
; X32-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permute_pd:
; X64:       # BB#0:
; X64-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
1645}
1646
1647define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
1648; X32-LABEL: test_mm_permute_ps:
1649; X32:       # BB#0:
1650; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
1651; X32-NEXT:    retl
1652;
1653; X64-LABEL: test_mm_permute_ps:
1654; X64:       # BB#0:
1655; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
1656; X64-NEXT:    retq
1657  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1658  ret <4 x float> %res
1659}
1660
1661define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
1662; X32-LABEL: test2_mm_permute_ps:
1663; X32:       # BB#0:
1664; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
1665; X32-NEXT:    retl
1666;
1667; X64-LABEL: test2_mm_permute_ps:
1668; X64:       # BB#0:
1669; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
1670; X64-NEXT:    retq
1671  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
1672  ret <4 x float> %res
1673}
1674
1675define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
1676; X32-LABEL: test_mm256_permute_ps:
1677; X32:       # BB#0:
1678; X32-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1679; X32-NEXT:    retl
1680;
1681; X64-LABEL: test_mm256_permute_ps:
1682; X64:       # BB#0:
1683; X64-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1684; X64-NEXT:    retq
1685  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
1686  ret <8 x float> %res
1687}
1688
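; The vperm2f128 tests below exercise the imm8 lane-select encoding: bits 1:0
; pick the 128-bit lane for the result's low half and bits 5:4 for its high
; half (0/1 = low/high lane of %a0, 2/3 = low/high lane of %a1), while bits 3
; and 7 zero the corresponding half. Imm 44 (0b00101100) therefore zeroes the
; low half and puts %a1's low lane in the high half, as the
; zero,zero,ymm1[0,1] checks show.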
1689define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1690; X32-LABEL: test_mm256_permute2f128_pd:
1691; X32:       # BB#0:
1692; X32-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
1693; X32-NEXT:    retl
1694;
1695; X64-LABEL: test_mm256_permute2f128_pd:
1696; X64:       # BB#0:
1697; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
1698; X64-NEXT:    retq
1699  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 44)
1700  ret <4 x double> %res
1701}
1702declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
1703
1704; PR26667
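; Imm 50 (0b00110010) selects both halves of the result from %a1, so the
; shuffle folds to a single register copy instead of a vperm2f128.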
1705define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1706; X32-LABEL: test_mm256_permute2f128_ps:
1707; X32:       # BB#0:
1708; X32-NEXT:    vmovaps %ymm1, %ymm0
1709; X32-NEXT:    retl
1710;
1711; X64-LABEL: test_mm256_permute2f128_ps:
1712; X64:       # BB#0:
1713; X64-NEXT:    vmovaps %ymm1, %ymm0
1714; X64-NEXT:    retq
1715  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 50)
1716  ret <8 x float> %res
1717}
1718declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
1719
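; Imm 35 (0b00100011) swaps the two 128-bit lanes of %a1, hence the
; ymm1[2,3,0,1] pattern checked below.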
1720define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1721; X32-LABEL: test_mm256_permute2f128_si256:
1722; X32:       # BB#0:
1723; X32-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
1724; X32-NEXT:    retl
1725;
1726; X64-LABEL: test_mm256_permute2f128_si256:
1727; X64:       # BB#0:
1728; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
1729; X64-NEXT:    retq
1730  %1 = bitcast <4 x i64> %a0 to <8 x i32>
1731  %2 = bitcast <4 x i64> %a1 to <8 x i32>
1732  %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %1, <8 x i32> %2, i8 35)
1733  %bc = bitcast <8 x i32> %res to <4 x i64>
1734  ret <4 x i64> %bc
1735}
1736declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
1737
1738define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
1739; X32-LABEL: test_mm_permutevar_pd:
1740; X32:       # BB#0:
1741; X32-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1742; X32-NEXT:    retl
1743;
1744; X64-LABEL: test_mm_permutevar_pd:
1745; X64:       # BB#0:
1746; X64-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1747; X64-NEXT:    retq
1748  %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
1749  ret <2 x double> %res
1750}
1751declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
1752
1753define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
1754; X32-LABEL: test_mm256_permutevar_pd:
1755; X32:       # BB#0:
1756; X32-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
1757; X32-NEXT:    retl
1758;
1759; X64-LABEL: test_mm256_permutevar_pd:
1760; X64:       # BB#0:
1761; X64-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
1762; X64-NEXT:    retq
1763  %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
1764  ret <4 x double> %res
1765}
1766declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
1767
1768define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
1769; X32-LABEL: test_mm_permutevar_ps:
1770; X32:       # BB#0:
1771; X32-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1772; X32-NEXT:    retl
1773;
1774; X64-LABEL: test_mm_permutevar_ps:
1775; X64:       # BB#0:
1776; X64-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1777; X64-NEXT:    retq
1778  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1779  %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1)
1780  ret <4 x float> %res
1781}
1782declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
1783
1784define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
1785; X32-LABEL: test_mm256_permutevar_ps:
1786; X32:       # BB#0:
1787; X32-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
1788; X32-NEXT:    retl
1789;
1790; X64-LABEL: test_mm256_permutevar_ps:
1791; X64:       # BB#0:
1792; X64-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
1793; X64-NEXT:    retq
1794  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1795  %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1)
1796  ret <8 x float> %res
1797}
1798declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
1799
1800define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
1801; X32-LABEL: test_mm256_rcp_ps:
1802; X32:       # BB#0:
1803; X32-NEXT:    vrcpps %ymm0, %ymm0
1804; X32-NEXT:    retl
1805;
1806; X64-LABEL: test_mm256_rcp_ps:
1807; X64:       # BB#0:
1808; X64-NEXT:    vrcpps %ymm0, %ymm0
1809; X64-NEXT:    retq
1810  %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
1811  ret <8 x float> %res
1812}
1813declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
1814
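; The round tests pass immediate 4, which in the usual smmintrin.h encoding is
; _MM_FROUND_CUR_DIRECTION (round according to the current MXCSR mode); the
; value is forwarded unchanged to vroundpd/vroundps.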
1815define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
1816; X32-LABEL: test_mm256_round_pd:
1817; X32:       # BB#0:
1818; X32-NEXT:    vroundpd $4, %ymm0, %ymm0
1819; X32-NEXT:    retl
1820;
1821; X64-LABEL: test_mm256_round_pd:
1822; X64:       # BB#0:
1823; X64-NEXT:    vroundpd $4, %ymm0, %ymm0
1824; X64-NEXT:    retq
1825  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
1826  ret <4 x double> %res
1827}
1828
1829define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
1830; X32-LABEL: test_mm256_round_ps:
1831; X32:       # BB#0:
1832; X32-NEXT:    vroundps $4, %ymm0, %ymm0
1833; X32-NEXT:    retl
1834;
1835; X64-LABEL: test_mm256_round_ps:
1836; X64:       # BB#0:
1837; X64-NEXT:    vroundps $4, %ymm0, %ymm0
1838; X64-NEXT:    retq
1839  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
1840  ret <8 x float> %res
1841}
1842
1843define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
1844; X32-LABEL: test_mm256_rsqrt_ps:
1845; X32:       # BB#0:
1846; X32-NEXT:    vrsqrtps %ymm0, %ymm0
1847; X32-NEXT:    retl
1848;
1849; X64-LABEL: test_mm256_rsqrt_ps:
1850; X64:       # BB#0:
1851; X64-NEXT:    vrsqrtps %ymm0, %ymm0
1852; X64-NEXT:    retq
1853  %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
1854  ret <8 x float> %res
1855}
1856declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
1857
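; _mm256_set_* takes its arguments from the most-significant element down to
; the least, so %a31 ends up in element 0 below. Without AVX2, codegen builds
; each 128-bit half with scalar inserts (movzbl + vpinsrb per byte) and joins
; the halves with vinsertf128.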
1858define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
1859; X32-LABEL: test_mm256_set_epi8:
1860; X32:       # BB#0:
1861; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1862; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
1863; X32-NEXT:    vmovd %ecx, %xmm0
1864; X32-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
1865; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1866; X32-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1867; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1868; X32-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
1869; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1870; X32-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1871; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1872; X32-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
1873; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1874; X32-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1875; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1876; X32-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
1877; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1878; X32-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1879; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1880; X32-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
1881; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1882; X32-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1883; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1884; X32-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
1885; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1886; X32-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1887; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1888; X32-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
1889; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1890; X32-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1891; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1892; X32-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
1893; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1894; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
1895; X32-NEXT:    vmovd %ecx, %xmm1
1896; X32-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
1897; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1898; X32-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
1899; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1900; X32-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
1901; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1902; X32-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
1903; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1904; X32-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
1905; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1906; X32-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
1907; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1908; X32-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
1909; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1910; X32-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
1911; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1912; X32-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
1913; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1914; X32-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
1915; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1916; X32-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
1917; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1918; X32-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
1919; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1920; X32-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
1921; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1922; X32-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
1923; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1924; X32-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
1925; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1926; X32-NEXT:    retl
1927;
1928; X64-LABEL: test_mm256_set_epi8:
1929; X64:       # BB#0:
1930; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
1931; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1932; X64-NEXT:    vmovd %eax, %xmm0
1933; X64-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0
1934; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1935; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1936; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1937; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
1938; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1939; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1940; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1941; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
1942; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1943; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1944; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1945; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
1946; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1947; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1948; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1949; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
1950; X64-NEXT:    movzbl %r9b, %eax
1951; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1952; X64-NEXT:    movzbl %r8b, %eax
1953; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
1954; X64-NEXT:    movzbl %cl, %eax
1955; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1956; X64-NEXT:    movzbl %dl, %eax
1957; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
1958; X64-NEXT:    movzbl %sil, %eax
1959; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1960; X64-NEXT:    movzbl %dil, %eax
1961; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
1962; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1963; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
1964; X64-NEXT:    vmovd %ecx, %xmm1
1965; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
1966; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1967; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
1968; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1969; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
1970; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1971; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
1972; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1973; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
1974; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1975; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
1976; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1977; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
1978; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1979; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
1980; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1981; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
1982; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1983; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
1984; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1985; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
1986; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1987; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
1988; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1989; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
1990; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1991; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
1992; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1993; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
1994; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1995; X64-NEXT:    retq
1996  %res0  = insertelement <32 x i8> undef,  i8 %a31, i32 0
1997  %res1  = insertelement <32 x i8> %res0,  i8 %a30, i32 1
1998  %res2  = insertelement <32 x i8> %res1,  i8 %a29, i32 2
1999  %res3  = insertelement <32 x i8> %res2,  i8 %a28, i32 3
2000  %res4  = insertelement <32 x i8> %res3,  i8 %a27, i32 4
2001  %res5  = insertelement <32 x i8> %res4,  i8 %a26, i32 5
2002  %res6  = insertelement <32 x i8> %res5,  i8 %a25, i32 6
2003  %res7  = insertelement <32 x i8> %res6,  i8 %a24, i32 7
2004  %res8  = insertelement <32 x i8> %res7,  i8 %a23, i32 8
2005  %res9  = insertelement <32 x i8> %res8,  i8 %a22, i32 9
2006  %res10 = insertelement <32 x i8> %res9,  i8 %a21, i32 10
2007  %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
2008  %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
2009  %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
2010  %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
2011  %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
2012  %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
2013  %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
2014  %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
2015  %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
2016  %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
2017  %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
2018  %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
2019  %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
2020  %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
2021  %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
2022  %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
2023  %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
2024  %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
2025  %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
2026  %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
2027  %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
2028  %res = bitcast <32 x i8> %res31 to <4 x i64>
2029  ret <4 x i64> %res
2030}
2031
2032define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
2033; X32-LABEL: test_mm256_set_epi16:
2034; X32:       # BB#0:
2035; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2036; X32-NEXT:    vmovd %eax, %xmm0
2037; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2038; X32-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2039; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2040; X32-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
2041; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2042; X32-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
2043; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2044; X32-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
2045; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2046; X32-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
2047; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2048; X32-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
2049; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2050; X32-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
2051; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2052; X32-NEXT:    vmovd %eax, %xmm1
2053; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2054; X32-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
2055; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2056; X32-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
2057; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2058; X32-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
2059; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2060; X32-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
2061; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2062; X32-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
2063; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2064; X32-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
2065; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2066; X32-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
2067; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2068; X32-NEXT:    retl
2069;
2070; X64-LABEL: test_mm256_set_epi16:
2071; X64:       # BB#0:
2072; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2073; X64-NEXT:    vmovd %eax, %xmm0
2074; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2075; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2076; X64-NEXT:    vpinsrw $2, %r9d, %xmm0, %xmm0
2077; X64-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0
2078; X64-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
2079; X64-NEXT:    vpinsrw $5, %edx, %xmm0, %xmm0
2080; X64-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0
2081; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
2082; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2083; X64-NEXT:    vmovd %eax, %xmm1
2084; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2085; X64-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
2086; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2087; X64-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
2088; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2089; X64-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
2090; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2091; X64-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
2092; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2093; X64-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
2094; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2095; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
2096; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2097; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
2098; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2099; X64-NEXT:    retq
2100  %res0  = insertelement <16 x i16> undef,  i16 %a15, i32 0
2101  %res1  = insertelement <16 x i16> %res0,  i16 %a14, i32 1
2102  %res2  = insertelement <16 x i16> %res1,  i16 %a13, i32 2
2103  %res3  = insertelement <16 x i16> %res2,  i16 %a12, i32 3
2104  %res4  = insertelement <16 x i16> %res3,  i16 %a11, i32 4
2105  %res5  = insertelement <16 x i16> %res4,  i16 %a10, i32 5
2106  %res6  = insertelement <16 x i16> %res5,  i16 %a9 , i32 6
2107  %res7  = insertelement <16 x i16> %res6,  i16 %a8 , i32 7
2108  %res8  = insertelement <16 x i16> %res7,  i16 %a7 , i32 8
2109  %res9  = insertelement <16 x i16> %res8,  i16 %a6 , i32 9
2110  %res10 = insertelement <16 x i16> %res9,  i16 %a5 , i32 10
2111  %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11
2112  %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12
2113  %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13
2114  %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14
2115  %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15
2116  %res = bitcast <16 x i16> %res15 to <4 x i64>
2117  ret <4 x i64> %res
2118}
2119
2120define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
2121; X32-LABEL: test_mm256_set_epi32:
2122; X32:       # BB#0:
2123; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2124; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2125; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2126; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2127; X32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2128; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
2129; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
2130; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
2131; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2132; X32-NEXT:    retl
2133;
2134; X64-LABEL: test_mm256_set_epi32:
2135; X64:       # BB#0:
2136; X64-NEXT:    vmovd %ecx, %xmm0
2137; X64-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
2138; X64-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
2139; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
2140; X64-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2141; X64-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
2142; X64-NEXT:    vpinsrd $2, %r9d, %xmm1, %xmm1
2143; X64-NEXT:    vpinsrd $3, %r8d, %xmm1, %xmm1
2144; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2145; X64-NEXT:    retq
2146  %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
2147  %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
2148  %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
2149  %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
2150  %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
2151  %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
2152  %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
2153  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
2154  %res = bitcast <8 x i32> %res7 to <4 x i64>
2155  ret <4 x i64> %res
2156}
2157
2158define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
2159; X32-LABEL: test_mm256_set_epi64x:
2160; X32:       # BB#0:
2161; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2162; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2163; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2164; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2165; X32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2166; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
2167; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
2168; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
2169; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2170; X32-NEXT:    retl
2171;
2172; X64-LABEL: test_mm256_set_epi64x:
2173; X64:       # BB#0:
2174; X64-NEXT:    vmovq %rdi, %xmm0
2175; X64-NEXT:    vmovq %rsi, %xmm1
2176; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2177; X64-NEXT:    vmovq %rdx, %xmm1
2178; X64-NEXT:    vmovq %rcx, %xmm2
2179; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
2180; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2181; X64-NEXT:    retq
2182  %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
2183  %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
2184  %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
2185  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
2186  ret <4 x i64> %res3
2187}
2188
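; In the m128 tests the '# kill' annotations mark an xmm argument being
; implicitly widened to the ymm register that vinsertf128 reads; its upper 128
; bits are undefined at that point, which is harmless because vinsertf128
; replaces that half of the result from the other operand.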
2189define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
2190; X32-LABEL: test_mm256_set_m128:
2191; X32:       # BB#0:
2192; X32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
2193; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2194; X32-NEXT:    retl
2195;
2196; X64-LABEL: test_mm256_set_m128:
2197; X64:       # BB#0:
2198; X64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
2199; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2200; X64-NEXT:    retq
2201  %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2202  ret <8 x float> %res
2203}
2204
2205define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
2206; X32-LABEL: test_mm256_set_m128d:
2207; X32:       # BB#0:
2208; X32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
2209; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2210; X32-NEXT:    retl
2211;
2212; X64-LABEL: test_mm256_set_m128d:
2213; X64:       # BB#0:
2214; X64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
2215; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2216; X64-NEXT:    retq
2217  %arg0 = bitcast <2 x double> %a0 to <4 x float>
2218  %arg1 = bitcast <2 x double> %a1 to <4 x float>
2219  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2220  %bc = bitcast <8 x float> %res to <4 x double>
2221  ret <4 x double> %bc
2222}
2223
2224define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
2225; X32-LABEL: test_mm256_set_m128i:
2226; X32:       # BB#0:
2227; X32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
2228; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2229; X32-NEXT:    retl
2230;
2231; X64-LABEL: test_mm256_set_m128i:
2232; X64:       # BB#0:
2233; X64-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
2234; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2235; X64-NEXT:    retq
2236  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
2237  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
2238  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2239  %bc = bitcast <8 x float> %res to <4 x i64>
2240  ret <4 x i64> %bc
2241}
2242
2243define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
2244; X32-LABEL: test_mm256_set_pd:
2245; X32:       # BB#0:
2246; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2247; X32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
2248; X32-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
2249; X32-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
2250; X32-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2251; X32-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2252; X32-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2253; X32-NEXT:    retl
2254;
2255; X64-LABEL: test_mm256_set_pd:
2256; X64:       # BB#0:
2257; X64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2258; X64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
2259; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2260; X64-NEXT:    retq
2261  %res0 = insertelement <4 x double> undef, double %a3, i32 0
2262  %res1 = insertelement <4 x double> %res0, double %a2, i32 1
2263  %res2 = insertelement <4 x double> %res1, double %a1, i32 2
2264  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
2265  ret <4 x double> %res3
2266}
2267
2268define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
2269; X32-LABEL: test_mm256_set_ps:
2270; X32:       # BB#0:
2271; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2272; X32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2273; X32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2274; X32-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
2275; X32-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
2276; X32-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
2277; X32-NEXT:    vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
2278; X32-NEXT:    vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
2279; X32-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
2280; X32-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
2281; X32-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
2282; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2283; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
2284; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
2285; X32-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
2286; X32-NEXT:    retl
2287;
2288; X64-LABEL: test_mm256_set_ps:
2289; X64:       # BB#0:
2290; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2291; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2292; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2293; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
2294; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
2295; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
2296; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2297; X64-NEXT:    retq
2298  %res0 = insertelement <8 x float> undef, float %a7, i32 0
2299  %res1 = insertelement <8 x float> %res0, float %a6, i32 1
2300  %res2 = insertelement <8 x float> %res1, float %a5, i32 2
2301  %res3 = insertelement <8 x float> %res2, float %a4, i32 3
2302  %res4 = insertelement <8 x float> %res3, float %a3, i32 4
2303  %res5 = insertelement <8 x float> %res4, float %a2, i32 5
2304  %res6 = insertelement <8 x float> %res5, float %a1, i32 6
2305  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
2306  ret <8 x float> %res7
2307}
2308
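; The set1 (broadcast) tests splat one scalar: with only AVX1 available, the
; splat is materialized in an xmm register (vpxor+vpshufb for bytes,
; vpshuflw+vpshufd for words, vpshufd for dwords, vmovddup/vpermilps for
; floating point) and vinsertf128 then duplicates it into both 128-bit halves.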
2309define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
2310; X32-LABEL: test_mm256_set1_epi8:
2311; X32:       # BB#0:
2312; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2313; X32-NEXT:    vmovd %eax, %xmm0
2314; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2315; X32-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
2316; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2317; X32-NEXT:    retl
2318;
2319; X64-LABEL: test_mm256_set1_epi8:
2320; X64:       # BB#0:
2321; X64-NEXT:    movzbl %dil, %eax
2322; X64-NEXT:    vmovd %eax, %xmm0
2323; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2324; X64-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
2325; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2326; X64-NEXT:    retq
2327  %res0  = insertelement <32 x i8> undef,  i8 %a0, i32 0
2328  %res1  = insertelement <32 x i8> %res0,  i8 %a0, i32 1
2329  %res2  = insertelement <32 x i8> %res1,  i8 %a0, i32 2
2330  %res3  = insertelement <32 x i8> %res2,  i8 %a0, i32 3
2331  %res4  = insertelement <32 x i8> %res3,  i8 %a0, i32 4
2332  %res5  = insertelement <32 x i8> %res4,  i8 %a0, i32 5
2333  %res6  = insertelement <32 x i8> %res5,  i8 %a0, i32 6
2334  %res7  = insertelement <32 x i8> %res6,  i8 %a0, i32 7
2335  %res8  = insertelement <32 x i8> %res7,  i8 %a0, i32 8
2336  %res9  = insertelement <32 x i8> %res8,  i8 %a0, i32 9
2337  %res10 = insertelement <32 x i8> %res9,  i8 %a0, i32 10
2338  %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
2339  %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
2340  %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
2341  %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
2342  %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
2343  %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
2344  %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
2345  %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
2346  %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
2347  %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
2348  %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
2349  %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
2350  %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
2351  %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
2352  %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
2353  %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
2354  %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
2355  %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
2356  %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
2357  %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
2358  %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
2359  %res = bitcast <32 x i8> %res31 to <4 x i64>
2360  ret <4 x i64> %res
2361}
2362
2363define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
2364; X32-LABEL: test_mm256_set1_epi16:
2365; X32:       # BB#0:
2366; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2367; X32-NEXT:    vmovd %eax, %xmm0
2368; X32-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2369; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2370; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2371; X32-NEXT:    retl
2372;
2373; X64-LABEL: test_mm256_set1_epi16:
2374; X64:       # BB#0:
2375; X64-NEXT:    vmovd %edi, %xmm0
2376; X64-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2377; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2378; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2379; X64-NEXT:    retq
2380  %res0  = insertelement <16 x i16> undef,  i16 %a0, i32 0
2381  %res1  = insertelement <16 x i16> %res0,  i16 %a0, i32 1
2382  %res2  = insertelement <16 x i16> %res1,  i16 %a0, i32 2
2383  %res3  = insertelement <16 x i16> %res2,  i16 %a0, i32 3
2384  %res4  = insertelement <16 x i16> %res3,  i16 %a0, i32 4
2385  %res5  = insertelement <16 x i16> %res4,  i16 %a0, i32 5
2386  %res6  = insertelement <16 x i16> %res5,  i16 %a0, i32 6
2387  %res7  = insertelement <16 x i16> %res6,  i16 %a0, i32 7
2388  %res8  = insertelement <16 x i16> %res7,  i16 %a0, i32 8
2389  %res9  = insertelement <16 x i16> %res8,  i16 %a0, i32 9
2390  %res10 = insertelement <16 x i16> %res9,  i16 %a0, i32 10
2391  %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
2392  %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
2393  %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
2394  %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
2395  %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
2396  %res = bitcast <16 x i16> %res15 to <4 x i64>
2397  ret <4 x i64> %res
2398}
2399
2400define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
2401; X32-LABEL: test_mm256_set1_epi32:
2402; X32:       # BB#0:
2403; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2404; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2405; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2406; X32-NEXT:    retl
2407;
2408; X64-LABEL: test_mm256_set1_epi32:
2409; X64:       # BB#0:
2410; X64-NEXT:    vmovd %edi, %xmm0
2411; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2412; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2413; X64-NEXT:    retq
2414  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
2415  %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
2416  %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
2417  %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
2418  %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
2419  %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
2420  %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
2421  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
2422  %res = bitcast <8 x i32> %res7 to <4 x i64>
2423  ret <4 x i64> %res
2424}
2425
2426define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
2427; X32-LABEL: test_mm256_set1_epi64x:
2428; X32:       # BB#0:
2429; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2430; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2431; X32-NEXT:    vmovd %ecx, %xmm0
2432; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
2433; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
2434; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
2435; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2436; X32-NEXT:    retl
2437;
2438; X64-LABEL: test_mm256_set1_epi64x:
2439; X64:       # BB#0:
2440; X64-NEXT:    vmovq %rdi, %xmm0
2441; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2442; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2443; X64-NEXT:    retq
2444  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
2445  %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
2446  %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
2447  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
2448  ret <4 x i64> %res3
2449}
2450
2451define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
2452; X32-LABEL: test_mm256_set1_pd:
2453; X32:       # BB#0:
2454; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2455; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2456; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2457; X32-NEXT:    retl
2458;
2459; X64-LABEL: test_mm256_set1_pd:
2460; X64:       # BB#0:
2461; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
2462; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2463; X64-NEXT:    retq
2464  %res0 = insertelement <4 x double> undef, double %a0, i32 0
2465  %res1 = insertelement <4 x double> %res0, double %a0, i32 1
2466  %res2 = insertelement <4 x double> %res1, double %a0, i32 2
2467  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
2468  ret <4 x double> %res3
2469}
2470
2471define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
2472; X32-LABEL: test_mm256_set1_ps:
2473; X32:       # BB#0:
2474; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2475; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2476; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2477; X32-NEXT:    retl
2478;
2479; X64-LABEL: test_mm256_set1_ps:
2480; X64:       # BB#0:
2481; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2482; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2483; X64-NEXT:    retq
2484  %res0 = insertelement <8 x float> undef, float %a0, i32 0
2485  %res1 = insertelement <8 x float> %res0, float %a0, i32 1
2486  %res2 = insertelement <8 x float> %res1, float %a0, i32 2
2487  %res3 = insertelement <8 x float> %res2, float %a0, i32 3
2488  %res4 = insertelement <8 x float> %res3, float %a0, i32 4
2489  %res5 = insertelement <8 x float> %res4, float %a0, i32 5
2490  %res6 = insertelement <8 x float> %res5, float %a0, i32 6
2491  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
2492  ret <8 x float> %res7
2493}
2494
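; The _mm256_setr_* variants take their arguments in ascending element order,
; so %a0 lands in element 0 below; otherwise the lowering mirrors the set_*
; tests above.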
2495define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
2496; X32-LABEL: test_mm256_setr_epi8:
2497; X32:       # BB#0:
2498; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2499; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
2500; X32-NEXT:    vmovd %ecx, %xmm0
2501; X32-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
2502; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2503; X32-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
2504; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2505; X32-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
2506; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2507; X32-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
2508; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2509; X32-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
2510; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2511; X32-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
2512; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2513; X32-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
2514; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2515; X32-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
2516; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2517; X32-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
2518; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2519; X32-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
2520; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2521; X32-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
2522; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2523; X32-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
2524; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2525; X32-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
2526; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2527; X32-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
2528; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2529; X32-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2530; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2531; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
2532; X32-NEXT:    vmovd %ecx, %xmm1
2533; X32-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
2534; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2535; X32-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
2536; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2537; X32-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
2538; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2539; X32-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
2540; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2541; X32-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
2542; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2543; X32-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
2544; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2545; X32-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
2546; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2547; X32-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
2548; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2549; X32-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
2550; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2551; X32-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
2552; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2553; X32-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
2554; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2555; X32-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
2556; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2557; X32-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
2558; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2559; X32-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
2560; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2561; X32-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
2562; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2563; X32-NEXT:    retl
2564;
2565; X64-LABEL: test_mm256_setr_epi8:
2566; X64:       # BB#0:
2567; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2568; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2569; X64-NEXT:    vmovd %eax, %xmm0
2570; X64-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0
2571; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2572; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
2573; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2574; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
2575; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2576; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
2577; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2578; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
2579; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2580; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
2581; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2582; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
2583; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2584; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
2585; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2586; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
2587; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2588; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
2589; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2590; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
2591; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2592; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
2593; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2594; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
2595; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2596; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
2597; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2598; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2599; X64-NEXT:    movzbl %sil, %eax
2600; X64-NEXT:    movzbl %dil, %esi
2601; X64-NEXT:    vmovd %esi, %xmm1
2602; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
2603; X64-NEXT:    movzbl %dl, %eax
2604; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
2605; X64-NEXT:    movzbl %cl, %eax
2606; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
2607; X64-NEXT:    movzbl %r8b, %eax
2608; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
2609; X64-NEXT:    movzbl %r9b, %eax
2610; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
2611; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2612; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
2613; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2614; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
2615; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2616; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
2617; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2618; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
2619; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2620; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
2621; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2622; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
2623; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2624; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
2625; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2626; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
2627; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2628; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
2629; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2630; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
2631; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2632; X64-NEXT:    retq
2633  %res0  = insertelement <32 x i8> undef,  i8 %a0 , i32 0
2634  %res1  = insertelement <32 x i8> %res0,  i8 %a1 , i32 1
2635  %res2  = insertelement <32 x i8> %res1,  i8 %a2 , i32 2
2636  %res3  = insertelement <32 x i8> %res2,  i8 %a3 , i32 3
2637  %res4  = insertelement <32 x i8> %res3,  i8 %a4 , i32 4
2638  %res5  = insertelement <32 x i8> %res4,  i8 %a5 , i32 5
2639  %res6  = insertelement <32 x i8> %res5,  i8 %a6 , i32 6
2640  %res7  = insertelement <32 x i8> %res6,  i8 %a7 , i32 7
2641  %res8  = insertelement <32 x i8> %res7,  i8 %a8 , i32 8
2642  %res9  = insertelement <32 x i8> %res8,  i8 %a9 , i32 9
2643  %res10 = insertelement <32 x i8> %res9,  i8 %a10, i32 10
2644  %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
2645  %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
2646  %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
2647  %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
2648  %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
2649  %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
2650  %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
2651  %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
2652  %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
2653  %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
2654  %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
2655  %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
2656  %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
2657  %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
2658  %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
2659  %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
2660  %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
2661  %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
2662  %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
2663  %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
2664  %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
2665  %res = bitcast <32 x i8> %res31 to <4 x i64>
2666  ret <4 x i64> %res
2667}
2668
2669define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
2670; X32-LABEL: test_mm256_setr_epi16:
2671; X32:       # BB#0:
2672; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2673; X32-NEXT:    vmovd %eax, %xmm0
2674; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2675; X32-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2676; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2677; X32-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
2678; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2679; X32-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
2680; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2681; X32-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
2682; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2683; X32-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
2684; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2685; X32-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
2686; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2687; X32-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
2688; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2689; X32-NEXT:    vmovd %eax, %xmm1
2690; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2691; X32-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
2692; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2693; X32-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
2694; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2695; X32-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
2696; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2697; X32-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
2698; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2699; X32-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
2700; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2701; X32-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
2702; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
2703; X32-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
2704; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2705; X32-NEXT:    retl
2706;
2707; X64-LABEL: test_mm256_setr_epi16:
2708; X64:       # BB#0:
2709; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2710; X64-NEXT:    vmovd %eax, %xmm0
2711; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2712; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2713; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2714; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
2715; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2716; X64-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
2717; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2718; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
2719; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2720; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
2721; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2722; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
2723; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2724; X64-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
2725; X64-NEXT:    vmovd %edi, %xmm1
2726; X64-NEXT:    vpinsrw $1, %esi, %xmm1, %xmm1
2727; X64-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
2728; X64-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
2729; X64-NEXT:    vpinsrw $4, %r8d, %xmm1, %xmm1
2730; X64-NEXT:    vpinsrw $5, %r9d, %xmm1, %xmm1
2731; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2732; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
2733; X64-NEXT:    movw {{[0-9]+}}(%rsp), %ax
2734; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
2735; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2736; X64-NEXT:    retq
2737  %res0  = insertelement <16 x i16> undef,  i16 %a0 , i32 0
2738  %res1  = insertelement <16 x i16> %res0,  i16 %a1 , i32 1
2739  %res2  = insertelement <16 x i16> %res1,  i16 %a2 , i32 2
2740  %res3  = insertelement <16 x i16> %res2,  i16 %a3 , i32 3
2741  %res4  = insertelement <16 x i16> %res3,  i16 %a4 , i32 4
2742  %res5  = insertelement <16 x i16> %res4,  i16 %a5 , i32 5
2743  %res6  = insertelement <16 x i16> %res5,  i16 %a6 , i32 6
2744  %res7  = insertelement <16 x i16> %res6,  i16 %a7 , i32 7
2745  %res8  = insertelement <16 x i16> %res7,  i16 %a8 , i32 8
2746  %res9  = insertelement <16 x i16> %res8,  i16 %a9 , i32 9
2747  %res10 = insertelement <16 x i16> %res9,  i16 %a10, i32 10
2748  %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
2749  %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
2750  %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
2751  %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
2752  %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
2753  %res = bitcast <16 x i16> %res15 to <4 x i64>
2754  ret <4 x i64> %res
2755}
2756
define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X32-LABEL: test_mm256_setr_epi32:
; X32:       # BB#0:
; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi32:
; X64:       # BB#0:
; X64-NEXT:    vmovd %r8d, %xmm0
; X64-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

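; setr_epi64x builds each 128-bit half from GPRs with vmovq + vpunpcklqdq on X64 before
; the final vinsertf128; on X32 the i64 arguments are loaded from the stack as dword pairs.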
define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X32-LABEL: test_mm256_setr_epi64x:
; X32:       # BB#0:
; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi64x:
; X64:       # BB#0:
; X64-NEXT:    vmovq %rcx, %xmm0
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovq %rsi, %xmm1
; X64-NEXT:    vmovq %rdi, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
  ret <4 x i64> %res3
}

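; The setr_m128/_m128d/_m128i helpers concatenate two 128-bit values via an 8 x i32
; shufflevector and should fold to a single vinsertf128 of %xmm1 into the upper lane.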
define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm256_setr_m128:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_m128:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm256_setr_m128d:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_m128d:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x double> %a0 to <4 x float>
  %arg1 = bitcast <2 x double> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x double>
  ret <4 x double> %bc
}

define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_setr_m128i:
; X32:       # BB#0:
; X32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_m128i:
; X64:       # BB#0:
; X64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x i64>
  ret <4 x i64> %bc
}

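; setr_pd/setr_ps gather scalar FP arguments: vunpcklpd/vinsertps build each xmm half,
; then vinsertf128 forms the 256-bit result.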
define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X32-LABEL: test_mm256_setr_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
; X32-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X32-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_pd:
; X64:       # BB#0:
; X64-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a0, i32 0
  %res1 = insertelement <4 x double> %res0, double %a1, i32 1
  %res2 = insertelement <4 x double> %res1, double %a2, i32 2
  %res3 = insertelement <4 x double> %res2, double %a3, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X32-LABEL: test_mm256_setr_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_ps:
; X64:       # BB#0:
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a0, i32 0
  %res1 = insertelement <8 x float> %res0, float %a1, i32 1
  %res2 = insertelement <8 x float> %res1, float %a2, i32 2
  %res3 = insertelement <8 x float> %res2, float %a3, i32 3
  %res4 = insertelement <8 x float> %res3, float %a4, i32 4
  %res5 = insertelement <8 x float> %res4, float %a5, i32 5
  %res6 = insertelement <8 x float> %res5, float %a6, i32 6
  %res7 = insertelement <8 x float> %res6, float %a7, i32 7
  ret <8 x float> %res7
}

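; The setzero helpers return zeroinitializer, which should select the vxorps zeroing
; idiom on both targets.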
define <4 x double> @test_mm256_setzero_pd() nounwind {
; X32-LABEL: test_mm256_setzero_pd:
; X32:       # BB#0:
; X32-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setzero_pd:
; X64:       # BB#0:
; X64-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
  ret <4 x double> zeroinitializer
}

define <8 x float> @test_mm256_setzero_ps() nounwind {
; X32-LABEL: test_mm256_setzero_ps:
; X32:       # BB#0:
; X32-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setzero_ps:
; X64:       # BB#0:
; X64-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
  ret <8 x float> zeroinitializer
}

define <4 x i64> @test_mm256_setzero_si256() nounwind {
; X32-LABEL: test_mm256_setzero_si256:
; X32:       # BB#0:
; X32-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_setzero_si256:
; X64:       # BB#0:
; X64-NEXT:    vxorps %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
  ret <4 x i64> zeroinitializer
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_sqrt_pd:
; X32:       # BB#0:
; X32-NEXT:    vsqrtpd %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_sqrt_pd:
; X64:       # BB#0:
; X64-NEXT:    vsqrtpd %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone

define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_sqrt_ps:
; X32:       # BB#0:
; X32-NEXT:    vsqrtps %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_sqrt_ps:
; X64:       # BB#0:
; X64-NEXT:    vsqrtps %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone

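; Store tests: the aligned variants store with align 32 and expect vmovaps, the storeu
; variants store with align 1 and expect vmovups; each function ends with vzeroupper
; before returning.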
define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_store_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_store_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  store <4 x double> %a1, <4 x double>* %arg0, align 32
  ret void
}

define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_store_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_store_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  store <8 x float> %a1, <8 x float>* %arg0, align 32
  ret void
}

define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_store_si256:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_store_si256:
; X64:       # BB#0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, <4 x i64>* %a0, align 32
  ret void
}

define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_storeu_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  store <4 x double> %a1, <4 x double>* %arg0, align 1
  ret void
}

define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_storeu_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  store <8 x float> %a1, <8 x float>* %arg0, align 1
  ret void
}

define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_storeu_si256:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovups %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_si256:
; X64:       # BB#0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, <4 x i64>* %a0, align 1
  ret void
}

define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind {
; X32-LABEL: test_mm256_storeu2_m128:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovups %xmm0, (%ecx)
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vmovups %xmm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128:
; X64:       # BB#0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %lo, <4 x float>* %arg0, align 1
  %arg1 = bitcast float* %a1 to <4 x float>*
  %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %hi, <4 x float>* %arg1, align 1
  ret void
}

define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind {
; X32-LABEL: test_mm256_storeu2_m128d:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovups %xmm0, (%ecx)
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vmovups %xmm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128d:
; X64:       # BB#0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <2 x double>*
  %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x double> %lo, <2 x double>* %arg0, align 1
  %arg1 = bitcast double* %a1 to <2 x double>*
  %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x double> %hi, <2 x double>* %arg1, align 1
  ret void
}

define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind {
; X32-LABEL: test_mm256_storeu2_m128i:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovups %xmm0, (%ecx)
; X32-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X32-NEXT:    vmovups %xmm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128i:
; X64:       # BB#0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64>* %a0 to <2 x i64>*
  %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x i64> %lo, <2 x i64>* %arg0, align 1
  %arg1 = bitcast <2 x i64>* %a1 to <2 x i64>*
  %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x i64> %hi, <2 x i64>* %arg1, align 1
  ret void
}

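; The stream stores carry !nontemporal metadata (!0, defined at the end of the file) and
; are expected to select vmovntps.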
define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_stream_pd:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovntps %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_stream_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovntps %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_stream_si256:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovntps %ymm0, (%eax)
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_si256:
; X64:       # BB#0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0
  ret void
}

define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_sub_pd:
; X32:       # BB#0:
; X32-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_sub_pd:
; X64:       # BB#0:
; X64-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = fsub <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_sub_ps:
; X32:       # BB#0:
; X32-NEXT:    vsubps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_sub_ps:
; X64:       # BB#0:
; X64-NEXT:    vsubps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = fsub <8 x float> %a0, %a1
  ret <8 x float> %res
}

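; ptest/vtest tests: the testc variants materialize the carry flag with sbbl+andl, while
; the testz/testnzc variants use xorl followed by sete/seta; the 256-bit forms also emit
; vzeroupper.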
define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_testc_pd:
; X32:       # BB#0:
; X32-NEXT:    vtestpd %xmm1, %xmm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_testc_pd:
; X64:       # BB#0:
; X64-NEXT:    vtestpd %xmm1, %xmm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_testc_pd:
; X32:       # BB#0:
; X32-NEXT:    vtestpd %ymm1, %ymm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testc_pd:
; X64:       # BB#0:
; X64-NEXT:    vtestpd %ymm1, %ymm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_testc_ps:
; X32:       # BB#0:
; X32-NEXT:    vtestps %xmm1, %xmm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_testc_ps:
; X64:       # BB#0:
; X64-NEXT:    vtestps %xmm1, %xmm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_testc_ps:
; X32:       # BB#0:
; X32-NEXT:    vtestps %ymm1, %ymm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testc_ps:
; X64:       # BB#0:
; X64-NEXT:    vtestps %ymm1, %ymm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_testc_si256:
; X32:       # BB#0:
; X32-NEXT:    vptest %ymm1, %ymm0
; X32-NEXT:    sbbl %eax, %eax
; X32-NEXT:    andl $1, %eax
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testc_si256:
; X64:       # BB#0:
; X64-NEXT:    vptest %ymm1, %ymm0
; X64-NEXT:    sbbl %eax, %eax
; X64-NEXT:    andl $1, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_testnzc_pd:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vtestpd %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_testnzc_pd:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vtestpd %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_testnzc_pd:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vtestpd %ymm1, %ymm0
; X32-NEXT:    seta %al
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testnzc_pd:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vtestpd %ymm1, %ymm0
; X64-NEXT:    seta %al
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_testnzc_ps:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vtestps %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_testnzc_ps:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vtestps %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_testnzc_ps:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vtestps %ymm1, %ymm0
; X32-NEXT:    seta %al
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testnzc_ps:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vtestps %ymm1, %ymm0
; X64-NEXT:    seta %al
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_testnzc_si256:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vptest %ymm1, %ymm0
; X32-NEXT:    seta %al
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testnzc_si256:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vptest %ymm1, %ymm0
; X64-NEXT:    seta %al
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_testz_pd:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vtestpd %xmm1, %xmm0
; X32-NEXT:    sete %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_testz_pd:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vtestpd %xmm1, %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_testz_pd:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vtestpd %ymm1, %ymm0
; X32-NEXT:    sete %al
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testz_pd:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vtestpd %ymm1, %ymm0
; X64-NEXT:    sete %al
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_testz_ps:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vtestps %xmm1, %xmm0
; X32-NEXT:    sete %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_testz_ps:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vtestps %xmm1, %xmm0
; X64-NEXT:    sete %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_testz_ps:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vtestps %ymm1, %ymm0
; X32-NEXT:    sete %al
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testz_ps:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vtestps %ymm1, %ymm0
; X64-NEXT:    sete %al
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_testz_si256:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    vptest %ymm1, %ymm0
; X32-NEXT:    sete %al
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_testz_si256:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    vptest %ymm1, %ymm0
; X64-NEXT:    sete %al
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone

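; The undefined helpers just return undef, so nothing beyond the return is expected.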
define <2 x double> @test_mm_undefined_pd() nounwind {
; X32-LABEL: test_mm_undefined_pd:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_undefined_pd:
; X64:       # BB#0:
; X64-NEXT:    retq
  ret <2 x double> undef
}

define <4 x double> @test_mm256_undefined_pd() nounwind {
; X32-LABEL: test_mm256_undefined_pd:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_undefined_pd:
; X64:       # BB#0:
; X64-NEXT:    retq
  ret <4 x double> undef
}

define <8 x float> @test_mm256_undefined_ps() nounwind {
; X32-LABEL: test_mm256_undefined_ps:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_undefined_ps:
; X64:       # BB#0:
; X64-NEXT:    retq
  ret <8 x float> undef
}

define <4 x i64> @test_mm256_undefined_si256() nounwind {
; X32-LABEL: test_mm256_undefined_si256:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_undefined_si256:
; X64:       # BB#0:
; X64-NEXT:    retq
  ret <4 x i64> undef
}

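; unpackhi/unpacklo interleave within each 128-bit lane, matching the vunpckh*/vunpckl*
; shuffle masks checked below.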
define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_unpackhi_pd:
; X32:       # BB#0:
; X32-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpackhi_pd:
; X64:       # BB#0:
; X64-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_unpackhi_ps:
; X32:       # BB#0:
; X32-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpackhi_ps:
; X64:       # BB#0:
; X64-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_unpacklo_pd:
; X32:       # BB#0:
; X32-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpacklo_pd:
; X64:       # BB#0:
; X64-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_unpacklo_ps:
; X32:       # BB#0:
; X32-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_unpacklo_ps:
; X64:       # BB#0:
; X64-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_xor_pd:
; X32:       # BB#0:
; X32-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_xor_pd:
; X64:       # BB#0:
; X64-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = xor <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_xor_ps:
; X32:       # BB#0:
; X32-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_xor_ps:
; X64:       # BB#0:
; X64-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = xor <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

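; _mm256_zeroall and _mm256_zeroupper map directly onto the llvm.x86.avx.vzeroall and
; llvm.x86.avx.vzeroupper intrinsics.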
define void @test_mm256_zeroall() nounwind {
; X32-LABEL: test_mm256_zeroall:
; X32:       # BB#0:
; X32-NEXT:    vzeroall
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_zeroall:
; X64:       # BB#0:
; X64-NEXT:    vzeroall
; X64-NEXT:    retq
  call void @llvm.x86.avx.vzeroall()
  ret void
}
declare void @llvm.x86.avx.vzeroall() nounwind readnone

define void @test_mm256_zeroupper() nounwind {
; X32-LABEL: test_mm256_zeroupper:
; X32:       # BB#0:
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_zeroupper:
; X64:       # BB#0:
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  call void @llvm.x86.avx.vzeroupper()
  ret void
}
declare void @llvm.x86.avx.vzeroupper() nounwind readnone

!0 = !{i32 1}
