• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
; X64-NEXT:    retq
; Splat i32 element 0 across all 4 lanes (expects vpbroadcastd).
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = bitcast <4 x i32> %res0 to <2 x i64>
  ret <2 x i64> %res1
}

define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp0:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
; Merge-masked splat of %a2's element 0; lanes with a clear mask bit keep %a0.
  %trn1 = trunc i8 %a1 to i4
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x i32> %res0, <4 x i32> %arg0
  %res2 = bitcast <4 x i32> %res1 to <2 x i64>
  ret <2 x i64> %res2
}

define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp1:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
; Zero-masked splat of %a1's element 0; lanes with a clear mask bit are zeroed.
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x i32> %res0, <4 x i32> zeroinitializer
  %res2 = bitcast <4 x i32> %res1 to <2 x i64>
  ret <2 x i64> %res2
}

define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-NEXT:    retq
; Splat i32 element 0 across all 8 lanes of a 256-bit result.
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = bitcast <8 x i32> %res0 to <4 x i64>
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
; Merge-masked 256-bit splat; all 8 mask bits used, so no trunc is needed.
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastd_epi32:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
; Zero-masked 256-bit splat; all 8 mask bits used, so no trunc is needed.
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
; Splat i64 element 0 across both lanes (expects vpbroadcastq).
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp2:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
; Merge-masked i64 splat; only the low 2 mask bits are significant.
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x i64> %res0, <2 x i64> %a0
  ret <2 x i64> %res1
}

define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp3:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
; Zero-masked i64 splat; only the low 2 mask bits are significant.
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x i64> %res0, <2 x i64> zeroinitializer
  ret <2 x i64> %res1
}

define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    retq
; Splat i64 element 0 across all 4 lanes of a 256-bit result.
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp4:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
; Merge-masked 256-bit i64 splat; only the low 4 mask bits are significant.
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastq_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp5:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
; Zero-masked 256-bit i64 splat; only the low 4 mask bits are significant.
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
  ret <4 x i64> %res1
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
; Splat double element 0 across both lanes (lowered as vmovddup).
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp6:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT:    retq
; Merge-masked double splat; only the low 2 mask bits are significant.
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp7:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT:    retq
; Zero-masked double splat; only the low 2 mask bits are significant.
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm256_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
; Splat double element 0 across all 4 lanes of a 256-bit result.
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm256_mask_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp8:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
; Merge-masked 256-bit double splat; only the low 4 mask bits are significant.
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastsd_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp9:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
; Zero-masked 256-bit double splat; only the low 4 mask bits are significant.
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
; Splat float element 0 across all 4 lanes (expects vbroadcastss).
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp10:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
; Merge-masked float splat; only the low 4 mask bits are significant.
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp11:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
; Zero-masked float splat; only the low 4 mask bits are significant.
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm256_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
; Splat float element 0 across all 8 lanes of a 256-bit result.
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm256_mask_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT:    retq
; Merge-masked 256-bit float splat; all 8 mask bits used, so no trunc is needed.
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastss_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
; Zero-masked 256-bit float splat; all 8 mask bits used, so no trunc is needed.
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
; Duplicate the even (low) double into both lanes.
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp12:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT:    retq
; Merge-masked movddup; only the low 2 mask bits are significant.
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}

define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp13:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT:    retq
; Zero-masked movddup; only the low 2 mask bits are significant.
; (Renamed %trn1 -> %trn0 to match the other maskz tests, which derive it from %a0.)
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}

define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT:    retq
; Duplicate the even-indexed doubles into the odd lanes (0,0,2,2).
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp14:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT:    retq
; Merge-masked 256-bit movddup; only the low 4 mask bits are significant.
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}

define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_movddup_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp15:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_movddup_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT:    retq
; Zero-masked 256-bit movddup; only the low 4 mask bits are significant.
; (Renamed %trn1 -> %trn0 to match the other maskz tests, which derive it from %a0.)
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}

define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT:    retq
; Duplicate the odd-indexed floats into the even lanes (1,1,3,3).
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp16:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT:    retq
; Merge-masked movshdup; only the low 4 mask bits are significant.
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}

define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp17:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X64-NEXT:    retq
; Zero-masked movshdup; only the low 4 mask bits are significant.
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
; Duplicate the odd-indexed floats into the even lanes across 256 bits.
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
; Merge-masked 256-bit movshdup; all 8 mask bits used, so no trunc is needed.
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_movehdup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT:    retq
; Zero-masked 256-bit movshdup; all 8 mask bits used, so no trunc is needed.
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

; Unmasked _mm_moveldup_ps: even-lane duplication shuffle lowers to vmovsldup.
define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %res
}
856
; Merge-masked _mm_mask_moveldup_ps: only 4 mask bits are meaningful for a 128-bit
; op, hence the trunc-to-i4 / bitcast-to-<4 x i1> sequence (mirrors clang codegen).
; fast-isel round-trips the i4 through the stack before kmovw, so the CHECK lines
; include the andb/movb/movzbl spill sequence.
define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp18:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}
886
; Zero-masked _mm_maskz_moveldup_ps: i4 mask path as above; the zeroinitializer
; select folds into the {%k1} {z} form of vmovsldup.
define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp19:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}
916
; Unmasked _mm256_moveldup_ps: 256-bit even-lane duplication lowers to vmovsldup.
define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}
930
; Merge-masked _mm256_mask_moveldup_ps: full i8 mask as <8 x i1>; expect a single
; {%k1}-masked vmovsldup merging over %a0.
define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}
949
; Zero-masked _mm256_maskz_moveldup_ps: zeroinitializer select folds into the
; {%k1} {z} form of vmovsldup.
define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_moveldup_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}
968
; Unmasked _mm256_permutex_epi64 (imm = 3): single-source i64 lane permute
; lowers to vpermq with the [3,0,0,0] pattern.
define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; X32-LABEL: test_mm256_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x i64> %res
}
982
; Merge-masked _mm256_mask_permutex_epi64: 4 mask bits (trunc i8 -> i4 -> <4 x i1>),
; merging the [1,0,0,0] vpermq of %a2 over %a0 via {%k1}.
define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_mask_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp20:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x i64> %res0, <4 x i64> %a0
  ret <4 x i64> %res1
}
1012
; Zero-masked _mm256_maskz_permutex_epi64: 4 mask bits (trunc i8 -> i4 -> <4 x i1>);
; zeroinitializer select folds into the {%k1} {z} form of vpermq.
; NOTE: value renamed %trn1 -> %trn0 (it derives from %a0) to match the sibling
; maskz tests; IR value names do not affect the generated CHECK lines.
define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_epi64:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp21:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x i64> %res0, <4 x i64> zeroinitializer
  ret <4 x i64> %res1
}
1042
; Unmasked _mm256_permutex_pd (imm = 3): FP variant of the lane permute, lowers
; to vpermpd [3,0,0,0].
define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}
1056
; Merge-masked _mm256_mask_permutex_pd: i4 mask path; {%k1}-masked vpermpd merges
; the [1,0,0,0] permute of %a2 over %a0.
define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp22:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}
1086
; Zero-masked _mm256_maskz_permutex_pd: i4 mask path; zeroinitializer select
; folds into the {%k1} {z} form of vpermpd.
; NOTE: value renamed %trn1 -> %trn0 (it derives from %a0) to match the sibling
; maskz tests; IR value names do not affect the generated CHECK lines.
define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp23:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}
1116
; Unmasked _mm_shuffle_pd(a, b, 3): the <1,3> two-source shuffle is recognized
; as an unpack-high and lowered to vunpckhpd rather than vshufpd.
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %res
}
1130
; Merge-masked _mm_mask_shuffle_pd: only 2 mask bits matter for a 2-element op
; (trunc i8 -> i2, hence "andb $3"); expect a {%k1}-masked vunpckhpd.
define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_mm_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp24:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i2
  %arg1 = bitcast i2 %trn1 to <2 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> %a3, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg1, <2 x double> %res0, <2 x double> %a0
  ret <2 x double> %res1
}
1160
; Zero-masked _mm_maskz_shuffle_pd: i2 mask path; zeroinitializer select folds
; into the {%k1} {z} form of vunpckhpd.
; NOTE: value renamed %trn1 -> %trn0 (it derives from %a0) to match the sibling
; maskz tests; IR value names do not affect the generated CHECK lines.
define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp25:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $3, %al
; X32-NEXT:    movb %al, {{[0-9]+}}(%esp)
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $3, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i2
  %arg0 = bitcast i2 %trn0 to <2 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 1, i32 3>
  %res1 = select <2 x i1> %arg0, <2 x double> %res0, <2 x double> zeroinitializer
  ret <2 x double> %res1
}
1190
; Unmasked _mm256_shuffle_pd: per-128-bit-lane <1,5,2,6> shuffle lowers to vshufpd.
define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  ret <4 x double> %res
}
1204
; Merge-masked _mm256_mask_shuffle_pd: i4 mask path; {%k1}-masked vshufpd merges
; the two-source shuffle of %a2/%a3 over %a0.
define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp26:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x double> %a2, <4 x double> %a3, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg1, <4 x double> %res0, <4 x double> %a0
  ret <4 x double> %res1
}
1234
; Zero-masked _mm256_maskz_shuffle_pd: i4 mask path; zeroinitializer select
; folds into the {%k1} {z} form of vshufpd.
; NOTE: value renamed %trn1 -> %trn0 (it derives from %a0) to match the sibling
; maskz tests; IR value names do not affect the generated CHECK lines.
define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_pd:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp27:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x double> %a1, <4 x double> %a2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
  %res1 = select <4 x i1> %arg0, <4 x double> %res0, <4 x double> zeroinitializer
  ret <4 x double> %res1
}
1264
; Unmasked _mm_shuffle_ps: two-source <0,1,4,4> shuffle lowers to vshufps.
define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  ret <4 x float> %res
}
1278
; Merge-masked _mm_mask_shuffle_ps: i4 mask path; {%k1}-masked vshufps merges the
; two-source shuffle of %a2/%a3 over %a0.
define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp28:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X64-NEXT:    retq
  %trn1 = trunc i8 %a1 to i4
  %arg1 = bitcast i4 %trn1 to <4 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> %a3, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg1, <4 x float> %res0, <4 x float> %a0
  ret <4 x float> %res1
}
1308
; Zero-masked _mm_maskz_shuffle_ps: i4 mask path; zeroinitializer select folds
; into the {%k1} {z} form of vshufps.
define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:  .Ltmp29:
; X32-NEXT:    .cfi_def_cfa_offset 8
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    andb $15, %al
; X32-NEXT:    movb %al, (%esp)
; X32-NEXT:    movzbl (%esp), %eax
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    andb $15, %dil
; X64-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    kmovw %eax, %k1
; X64-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
  %trn0 = trunc i8 %a0 to i4
  %arg0 = bitcast i4 %trn0 to <4 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
  %res1 = select <4 x i1> %arg0, <4 x float> %res0, <4 x float> zeroinitializer
  ret <4 x float> %res1
}
1338
; Unmasked _mm256_shuffle_ps: the per-128-bit-lane pattern repeats in both
; halves, lowering to a single vshufps.
define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  ret <8 x float> %res
}
1352
; Merge-masked _mm256_mask_shuffle_ps: full i8 mask as <8 x i1>; expect a single
; {%k1}-masked vshufps merging over %a0.
define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}
1371
; Zero-masked _mm256_maskz_shuffle_ps: zeroinitializer select folds into the
; {%k1} {z} form of vshufps.
define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
; X32-NEXT:    kmovw %eax, %k1
; X32-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}
1390
; NOTE(review): no use of !0 is visible in this chunk of the file — presumably it
; is referenced by a function outside this view; confirm before removing.
!0 = !{i32 1}
1392