; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c

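; For orientation, a hypothetical C-level sketch of the kind of source the
; first test below corresponds to (assumed, not copied from the clang test;
; the operand order of the kunpack halves is approximate):
;
;   #include <immintrin.h>
;   __mmask16 kunpackb_sketch(__m512i A, __m512i B, __m512i C,
;                             __m512i D, __m512i E, __m512i F) {
;     __mmask16 k = _mm512_kunpackb(_mm512_cmpneq_epi32_mask(C, D),
;                                   _mm512_cmpneq_epi32_mask(A, B));
;     return _mm512_mask_cmpneq_epi32_mask(k, E, F);
;   }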
define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_kunpackb:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    kunpckbw %k0, %k1, %k1
; X86-NEXT:    vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckbw %k0, %k1, %k1
; X64-NEXT:    vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <16 x i32>
  %1 = bitcast <8 x i64> %__F to <16 x i32>
  %2 = bitcast <8 x i64> %__A to <16 x i32>
  %3 = bitcast <8 x i64> %__B to <16 x i32>
  %4 = icmp ne <16 x i32> %2, %3
  %5 = bitcast <8 x i64> %__C to <16 x i32>
  %6 = bitcast <8 x i64> %__D to <16 x i32>
  %7 = icmp ne <16 x i32> %5, %6
  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %11 = icmp ne <16 x i32> %0, %1
  %12 = and <16 x i1> %11, %10
  %13 = bitcast <16 x i1> %12 to i16
  ret i16 %13
}

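; kortestc/kortestz OR two 16-bit masks and test the result: kortestc asks
; whether the OR is all ones, kortestz whether it is all zeros. A hypothetical
; C-level sketch of the next test (assumed, not copied from the clang test):
;
;   int kortestc_sketch(__m512i A, __m512i B, __m512i C, __m512i D) {
;     return _mm512_kortestc(_mm512_cmpneq_epi32_mask(A, B),
;                            _mm512_cmpneq_epi32_mask(C, D));
;   }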
define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestc:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    korw %k0, %k1, %k0
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    cmpw $-1, %ax
; X86-NEXT:    sete %al
; X86-NEXT:    andb $1, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kortestc:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    korw %k0, %k1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    cmpw $-1, %ax
; X64-NEXT:    sete %al
; X64-NEXT:    andb $1, %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, -1
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestz:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    korw %k0, %k1, %k0
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    cmpw $0, %ax
; X86-NEXT:    sete %al
; X86-NEXT:    andb $1, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kortestz:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    korw %k0, %k1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    cmpw $0, %ax
; X64-NEXT:    sete %al
; X64-NEXT:    andb $1, %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, 0
  %9 = zext i1 %8 to i32
  ret i32 %9
}

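; _mm512_shuffle_f32x4(a, b, imm) selects four 128-bit lanes, two from each
; source, using 2-bit fields of the immediate. The lane pattern checked below
; (a0,a1,b0,b0) would come from an immediate of 4; a hypothetical sketch
; (assumed, not copied from the clang test):
;
;   __m512 shuffle_f32x4_sketch(__m512 A, __m512 B) {
;     return _mm512_shuffle_f32x4(A, B, 4);
;   }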
define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  ret <16 x float> %shuffle
}


define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
  ret <16 x float> %1
}

define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
  ret <16 x float> %1
}

define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x double> %shuffle
}

define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
  ret <8 x double> %1
}

define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast <8 x i64> %__W to <16 x i32>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}


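; vptestnm{d,q} computes (a & b) == 0 per element into a mask register; the
; IR below spells that out as `and` followed by `icmp eq` with zero. A
; hypothetical C-level sketch (assumed, not copied from the clang test):
;
;   __mmask16 testn_epi32_sketch(__m512i A, __m512i B) {
;     return _mm512_testn_epi32_mask(A, B);
;   }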
define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzwl %ax, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}

define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi64_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast <8 x i1> %0 to i8
  ret i8 %1
}

define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp ne <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

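; The set1 tests broadcast a scalar under a write mask: the mask variants keep
; %__O in deselected lanes, the maskz variants zero them. A hypothetical
; C-level sketch (assumed, not copied from the clang test):
;
;   __m512i mask_set1_epi32_sketch(__m512i O, __mmask16 M, int A) {
;     return _mm512_mask_set1_epi32(O, M, A);
;   }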
define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A)  {
; X86-LABEL: test_mm512_maskz_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

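; On the 32-bit target there is no 64-bit GPR form of vpbroadcastq, so the
; checks below assemble the i64 in an xmm register from its two 32-bit halves
; (vmovd + vpinsrd) before broadcasting.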
define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A)  {
; X86-LABEL: test_mm512_maskz_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}


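; The broadcast tests splat element 0 of a 128-bit source across all 512 bits.
; A hypothetical C-level sketch (assumed, not copied from the clang test):
;
;   __m512i broadcastd_sketch(__m128i a) {
;     return _mm512_broadcastd_epi32(a);
;   }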
define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastd_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastd_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastq_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastq_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm512_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
; X86-LABEL: test_mm512_mask_broadcastsd_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastsd_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm512_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm512_mask_broadcastss_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastss_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastss_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastss_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

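; vmovddup duplicates the even-indexed doubles (result[2i] = result[2i+1] =
; src[2i]); vmovshdup and vmovsldup do the same for odd and even floats. A
; hypothetical C-level sketch (assumed, not copied from the clang test):
;
;   __m512d movedup_sketch(__m512d a) {
;     return _mm512_movedup_pd(a);
;   }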
define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_movedup_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_movedup_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_movedup_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_movedup_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_movedup_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_movehdup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_movehdup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_movehdup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_movehdup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_movehdup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_moveldup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_moveldup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_moveldup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_moveldup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_moveldup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

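; _mm512_permute_pd(a, imm) permutes doubles within each 128-bit lane: bit i
; of the immediate picks the low or high double for element i. The pattern
; [0,1,2,2,4,4,6,6] below would come from an immediate of 2; a hypothetical
; sketch (assumed, not copied from the clang test):
;
;   __m512d permute_pd_sketch(__m512d a) {
;     return _mm512_permute_pd(a, 2);
;   }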
define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permute_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permute_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permute_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permute_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_permute_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permute_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_permute_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permute_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

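; _mm512_permutex_epi64/_mm512_permutex_pd shuffle 64-bit elements within each
; 256-bit half, applying the same 2-bit selector per element; the pattern
; [0,0,0,0,4,4,4,4] corresponds to an immediate of 0. A hypothetical sketch
; (assumed, not copied from the clang test):
;
;   __m512i permutex_epi64_sketch(__m512i a) {
;     return _mm512_permutex_epi64(a, 0);
;   }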
define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_permutex_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_permutex_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permutex_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permutex_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

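; _mm512_shuffle_epi32(a, imm) applies the same pshufd-style 4-element
; selector in every 128-bit lane; the repeating pattern (1,0,0,0) below would
; come from an immediate of 1. A hypothetical sketch (assumed, not copied from
; the clang test):
;
;   __m512i shuffle_epi32_sketch(__m512i a) {
;     return _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)1);
;   }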
define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_shuffle_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_shuffle_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

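; _mm512_shuffle_pd(a, b, imm) builds each 128-bit lane from one element of a
; (selected by bit 2i) and one of b (bit 2i+1). The pattern below matches an
; immediate of 4; a hypothetical sketch (assumed, not copied from the clang
; test):
;
;   __m512d shuffle_pd_sketch(__m512d a, __m512d b) {
;     return _mm512_shuffle_pd(a, b, 4);
;   }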
define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_shuffle_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_shuffle_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_shuffle_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

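; The unpackhi tests interleave the high halves of corresponding 128-bit
; lanes of the two sources. A hypothetical C-level sketch (assumed, not
; copied from the clang test):
;
;   __m512i unpackhi_epi32_sketch(__m512i a, __m512i b) {
;     return _mm512_unpackhi_epi32(a, b);
;   }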
1229define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
1230; CHECK-LABEL: test_mm512_unpackhi_epi32:
1231; CHECK:       # %bb.0:
1232; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1233; CHECK-NEXT:    ret{{[l|q]}}
1234  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1235  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1236  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1237  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
1238  ret <8 x i64> %res1
1239}
1240
1241define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1242; X86-LABEL: test_mm512_mask_unpackhi_epi32:
1243; X86:       # %bb.0:
1244; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1245; X86-NEXT:    kmovw %eax, %k1
1246; X86-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1247; X86-NEXT:    retl
1248;
1249; X64-LABEL: test_mm512_mask_unpackhi_epi32:
1250; X64:       # %bb.0:
1251; X64-NEXT:    kmovw %edi, %k1
1252; X64-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1253; X64-NEXT:    retq
1254  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1255  %arg1 = bitcast i16 %a1 to <16 x i1>
1256  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1257  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
1258  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1259  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1260  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1261  ret <8 x i64> %res2
1262}
1263
1264define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1265; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
1266; X86:       # %bb.0:
1267; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1268; X86-NEXT:    kmovw %eax, %k1
1269; X86-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1270; X86-NEXT:    retl
1271;
1272; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
1273; X64:       # %bb.0:
1274; X64-NEXT:    kmovw %edi, %k1
1275; X64-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1276; X64-NEXT:    retq
1277  %arg0 = bitcast i16 %a0 to <16 x i1>
1278  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1279  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1280  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1281  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1282  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1283  ret <8 x i64> %res2
1284}
1285
define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

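; unpacklo mirrors unpackhi but interleaves the low half of each 128-bit lane.
; Note that the unmasked integer forms are free to lower to the FP-domain
; vunpcklps/vunpcklpd encodings, while the masked forms must stay on
; vpunpckldq/vpunpcklqdq so the writemask applies at the right element
; granularity.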
define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

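; The zext* casts are modeled as a shufflevector whose second operand is
; zeroinitializer. Because any VEX/EVEX write to an xmm/ymm register already
; zeroes the upper bits of the containing zmm register, the whole pattern
; folds to a single register-to-itself vmovaps. Approximate clang-level
; usage (illustrative):
;   __m512d r = _mm512_zextpd128_pd512(x);  // x: __m128d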
define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd128_pd512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd256_pd512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps128_ps512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps256_ps512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %res
}

define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi128_si512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi256_si512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}

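; vpmuldq multiplies the sign-extended low 32 bits of each qword. The IR
; expresses that extension as a shl-by-32 followed by an exact ashr-by-32,
; which the backend pattern-matches directly to vpmuldq. Approximate usage:
;   __m512i r = _mm512_mul_epi32(a, b);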
define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
  ret <8 x i64> %tmp4
}

define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_mul_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %4 = mul nsw <8 x i64> %3, %1
  %5 = bitcast i8 %__k to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT:    vmovdqa64 %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_mul_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %4 = mul nsw <8 x i64> %3, %1
  %5 = bitcast i8 %__k to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src
  ret <8 x i64> %6
}

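; The unsigned variant zero-extends instead: masking each qword with
; 0xffffffff (4294967295) keeps only the low dword, and the nuw multiply
; matches vpmuludq. Approximate usage:
;   __m512i r = _mm512_maskz_mul_epu32(k, a, b);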
define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
  ret <8 x i64> %tmp2
}

define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = mul nuw <8 x i64> %1, %0
  %3 = bitcast i8 %__k to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT:    vmovdqa64 %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = mul nuw <8 x i64> %1, %0
  %3 = bitcast i8 %__k to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src
  ret <8 x i64> %4
}

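; With only +avx512f (no AVX512BW), there is no byte broadcast to a full zmm
; register, so the splat is built as an AVX2 vpbroadcastb into a ymm and then
; duplicated into the upper half with vinserti64x4.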
define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
; X86-LABEL: test_mm512_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpbroadcastb %xmm0, %ymm0
; X86-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-NEXT:    retq
entry:
  %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
  %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
  ret <8 x double> %0
}

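; Unsigned scalar conversions: AVX-512 provides vcvtusi2sd/vcvtusi2ss
; directly. On 32-bit targets there is no 64-bit GPR form, so the u64->double
; case below expands to the classic magic-number technique (assemble the
; value as two dword halves, subtract the exponent biases, add the halves),
; and the u64->float case goes through x87 fildll plus a conditional fadds
; of 2^64 selected by the sign bit.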
define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu32_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i32 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; X86-NEXT:    vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X86-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
; X86-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu64_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i64 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu32_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i32 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
; X86-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    shrl $31, %eax
; X86-NEXT:    fildll {{[0-9]+}}(%esp)
; X86-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu64_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i64 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

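; Half-to-float conversion is expressed as a bitcast to <16 x half> followed
; by fpext, which selects vcvtph2ps (with merge- and zero-masked forms
; following). Approximate usage:
;   __m512 r = _mm512_cvtph_ps(a);  // a: __m256i of packed fp16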
define <16 x float> @test_mm512_cvtph_ps(<4 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtph_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <16 x i16>
  %1 = bitcast <16 x i16> %0 to <16 x half>
  %2 = fpext <16 x half> %1 to <16 x float>
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_cvtph_ps(<16 x float> %__W, i16 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtph_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtph2ps %ymm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtph_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtph2ps %ymm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <16 x i16>
  %1 = bitcast <16 x i16> %0 to <16 x half>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = fpext <16 x half> %1 to <16 x float>
  %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> %__W
  ret <16 x float> %4
}

define <16 x float> @test_mm512_maskz_cvtph_ps(i16 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtph_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtph2ps %ymm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtph_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtph2ps %ymm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__A to <16 x i16>
  %1 = bitcast <16 x i16> %0 to <16 x half>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = fpext <16 x half> %1 to <16 x float>
  %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> zeroinitializer
  ret <16 x float> %4
}

define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtps_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = fpext <8 x float> %__A to <8 x double>
  ret <8 x double> %conv.i
}

define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtpslo_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
  ret <8 x double> %conv.i.i
}

define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <8 x float> %__A to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtpslo_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtpslo_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <8 x float> %__A to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
  ret <8 x double> %1
}

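; Integer narrowing (vpmovdb/vpmovqd/vpmovqw): the unmasked forms are plain
; IR trunc. The masked dword->byte and qword->word narrows are represented
; with the llvm.x86.avx512.mask.pmov.* intrinsics declared after this group,
; while the masked qword->dword narrow (a full ymm result) uses trunc plus
; select instead.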
define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi32_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %conv.i = trunc <16 x i32> %0 to <16 x i8>
  %1 = bitcast <16 x i8> %conv.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdb %zmm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdb %zmm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <2 x i64> %__O to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
  %3 = bitcast <16 x i8> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
  %2 = bitcast <16 x i8> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast <8 x i32> %conv.i to <4 x i64>
  ret <4 x i64> %0
}

define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %zmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %zmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast <4 x i64> %__O to <8 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %zmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %zmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <8 x i64> %__A to <8 x i16>
  %0 = bitcast <8 x i16> %conv.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqw %zmm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqw %zmm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__O to <8 x i16>
  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
  %2 = bitcast <8 x i16> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)

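; vpternlog's imm8 is a 3-input truth table: for each bit position the result
; is imm8[(a<<2)|(b<<1)|c], where a comes from the first operand. Here
; imm8 = 4 = 0b100 has only bit 2 set, and index 2 = (a=0,b=1,c=0), so these
; tests compute ~A & B & ~C. Approximate usage:
;   __m512i r = _mm512_ternarylogic_epi32(a, b, c, 4);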
define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1

define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  ret <8 x i64> %0
}

declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1

define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)

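; vpermi2var performs a two-source shuffle: each index selects an element
; from the concatenation of the two data operands. The mask2 variants merge
; the result with the index operand, which is why codegen keeps the vpermi2*
; form (it overwrites the index register) and then copies %zmm1 back to
; %zmm0. Approximate usage:
;   __m512i r = _mm512_mask2_permutex2var_epi32(a, idx, k, b);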
define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovapd %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast <8 x i64> %__I to <8 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
  ret <8 x double> %3
}

declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovaps %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovaps %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast <8 x i64> %__I to <16 x float>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
  ret <16 x float> %4
}

declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)

define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
  ret <8 x i64> %2
}

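; When the result merges with the first data operand instead (or no merge is
; needed), the backend prefers the vpermt2* form, which overwrites the data
; operand and leaves the index register intact.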
define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2627  %1 = bitcast i8 %__U to <8 x i1>
2628  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
2629  ret <8 x i64> %2
2630}
2631
2632define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2633; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
2634; X86:       # %bb.0: # %entry
2635; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
2636; X86-NEXT:    kmovw %eax, %k1
2637; X86-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2638; X86-NEXT:    retl
2639;
2640; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
2641; X64:       # %bb.0: # %entry
2642; X64-NEXT:    kmovw %edi, %k1
2643; X64-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2644; X64-NEXT:    retq
2645entry:
2646  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2647  %1 = bitcast i8 %__U to <8 x i1>
2648  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
2649  ret <8 x i64> %2
2650}
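
; The scalar tests below cover the masked single-element float/double
; arithmetic intrinsics (_mm_mask_add_ss and friends): element 0 carries the
; operation result when the low mask bit is set, the passthru (or zero in the
; maskz form) otherwise, and the upper elements pass through from %__A.
; A minimal C sketch of the pattern under test (assumed equivalent to the
; clang builtin test referenced in the header, not copied from it):
;
;   #include <immintrin.h>
;   __m128 sketch_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
;     return _mm_mask_add_ss(__W, __U, __A, __B); // blends add result into lane 0
;   }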
define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_add_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_add_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_add_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_add_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_add_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_add_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_add_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_add_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_sub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_sub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_sub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_sub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_sub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_sub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_sub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_sub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_mul_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_mul_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_mul_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_mul_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_mul_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_mul_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_mul_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_mul_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}
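
; The division tests below check the same masking behavior, but their IR
; models the mask the way newer clang emits it: bitcast the i8 mask to
; <8 x i1> and extract bit 0, instead of the and/icmp on the raw i8 used by
; the add/sub/mul tests above. Both forms select between the scalar result
; and the passthru (or 0.0) for element 0.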

define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__W, i64 0
  %3 = fdiv float %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = fdiv float %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, float %2, float 0.000000e+00
  %6 = insertelement <4 x float> %__A, float %5, i64 0
  ret <4 x float> %6
}

define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__W, i64 0
  %3 = fdiv double %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %2
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = fdiv double %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, double %2, double 0.000000e+00
  %6 = insertelement <2 x double> %__A, double %5, i64 0
  ret <2 x double> %6
}

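; The *_round_* FMA tests pass the rounding/SAE control to
; @llvm.x86.avx512.vfmadd.pd.512 as a trailing i32 argument. The constant 8
; used throughout is _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, which is
; what the {rn-sae} operand in the CHECK lines encodes.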
define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

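; There is no separate fmsub/fnmadd intrinsic in this IR: the relevant operand
; is negated with an fsub from a splat of -0.0 and fed to the fmadd intrinsic.
; In the unmasked forms below the backend keeps that negation as a vpxorq
; against a broadcast sign-bit constant, while the masked forms fold
; everything into a single vfmsub/vfnmadd instruction.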
define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

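; fnmsub negates both the multiplicand and the addend, so the unmasked form
; below needs two vpxorq ops against one broadcast -0.0 constant before the
; fmadd; the maskz form folds into a single vfnmsub instruction.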
define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

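; The non-rounding <8 x double> FMA tests go through the generic
; @llvm.fma.v8f64 intrinsic rather than the target-specific rounding variant;
; masking is still expressed as a select on the bitcast i8 mask.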
define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

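; The <16 x float> rounding tests repeat the same mask/mask3/maskz matrix with
; i16 masks (one bit per float element) and @llvm.x86.avx512.vfmadd.ps.512.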
3508define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3509; CHECK-LABEL: test_mm512_fmadd_round_ps:
3510; CHECK:       # %bb.0: # %entry
3511; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3512; CHECK-NEXT:    ret{{[l|q]}}
3513entry:
3514  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3515  ret <16 x float> %0
3516}
3517
3518declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
3519
3520define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3521; X86-LABEL: test_mm512_mask_fmadd_round_ps:
3522; X86:       # %bb.0: # %entry
3523; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
3524; X86-NEXT:    kmovw %eax, %k1
3525; X86-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3526; X86-NEXT:    retl
3527;
3528; X64-LABEL: test_mm512_mask_fmadd_round_ps:
3529; X64:       # %bb.0: # %entry
3530; X64-NEXT:    kmovw %edi, %k1
3531; X64-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3532; X64-NEXT:    retq
3533entry:
3534  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3535  %1 = bitcast i16 %__U to <16 x i1>
3536  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3537  ret <16 x float> %2
3538}
3539
3540define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3541; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
3542; X86:       # %bb.0: # %entry
3543; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
3544; X86-NEXT:    kmovw %eax, %k1
3545; X86-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3546; X86-NEXT:    vmovaps %zmm2, %zmm0
3547; X86-NEXT:    retl
3548;
3549; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
3550; X64:       # %bb.0: # %entry
3551; X64-NEXT:    kmovw %edi, %k1
3552; X64-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3553; X64-NEXT:    vmovaps %zmm2, %zmm0
3554; X64-NEXT:    retq
3555entry:
3556  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3557  %1 = bitcast i16 %__U to <16 x i1>
3558  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
3559  ret <16 x float> %2
3560}
3561
3562define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3563; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
3564; X86:       # %bb.0: # %entry
3565; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
3566; X86-NEXT:    kmovw %eax, %k1
3567; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3568; X86-NEXT:    retl
3569;
3570; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
3571; X64:       # %bb.0: # %entry
3572; X64-NEXT:    kmovw %edi, %k1
3573; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3574; X64-NEXT:    retq
3575entry:
3576  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3577  %1 = bitcast i16 %__U to <16 x i1>
3578  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3579  ret <16 x float> %2
3580}
3581
3582define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3583; X86-LABEL: test_mm512_fmsub_round_ps:
3584; X86:       # %bb.0: # %entry
3585; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
3586; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3587; X86-NEXT:    retl
3588;
3589; X64-LABEL: test_mm512_fmsub_round_ps:
3590; X64:       # %bb.0: # %entry
3591; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
3592; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3593; X64-NEXT:    retq
3594entry:
3595  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3596  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3597  ret <16 x float> %0
3598}
3599
3600define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3601; X86-LABEL: test_mm512_mask_fmsub_round_ps:
3602; X86:       # %bb.0: # %entry
3603; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
3604; X86-NEXT:    kmovw %eax, %k1
3605; X86-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3606; X86-NEXT:    retl
3607;
3608; X64-LABEL: test_mm512_mask_fmsub_round_ps:
3609; X64:       # %bb.0: # %entry
3610; X64-NEXT:    kmovw %edi, %k1
3611; X64-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3612; X64-NEXT:    retq
3613entry:
3614  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3615  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3616  %1 = bitcast i16 %__U to <16 x i1>
3617  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3618  ret <16 x float> %2
3619}
3620
3621define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3622; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
3623; X86:       # %bb.0: # %entry
3624; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
3625; X86-NEXT:    kmovw %eax, %k1
3626; X86-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3627; X86-NEXT:    retl
3628;
3629; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
3630; X64:       # %bb.0: # %entry
3631; X64-NEXT:    kmovw %edi, %k1
3632; X64-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3633; X64-NEXT:    retq
3634entry:
3635  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3636  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3637  %1 = bitcast i16 %__U to <16 x i1>
3638  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3639  ret <16 x float> %2
3640}
3641
3642define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3643; X86-LABEL: test_mm512_fnmadd_round_ps:
3644; X86:       # %bb.0: # %entry
3645; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
3646; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3647; X86-NEXT:    retl
3648;
3649; X64-LABEL: test_mm512_fnmadd_round_ps:
3650; X64:       # %bb.0: # %entry
3651; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
3652; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3653; X64-NEXT:    retq
3654entry:
3655  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3656  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
3657  ret <16 x float> %0
3658}
3659
3660define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3661; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
3662; X86:       # %bb.0: # %entry
3663; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
3664; X86-NEXT:    kmovw %eax, %k1
3665; X86-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3666; X86-NEXT:    vmovaps %zmm2, %zmm0
3667; X86-NEXT:    retl
3668;
3669; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
3670; X64:       # %bb.0: # %entry
3671; X64-NEXT:    kmovw %edi, %k1
3672; X64-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3673; X64-NEXT:    vmovaps %zmm2, %zmm0
3674; X64-NEXT:    retq
3675entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

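; The tests below cover the FMA forms without an explicit rounding argument;
; these are built on the generic @llvm.fma.* intrinsic, and the masked
; variants model the write-mask as a bitcast of the scalar mask to a vector
; of i1 feeding a select against the passthrough (zeroinitializer for maskz).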
define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

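; For the unmasked fmsub/fnmadd forms the negation of one operand is folded
; into a vpxord with a splatted -0.0 constant; the X86 and X64 checks differ
; only in how that constant is addressed (a {{\.LCPI.*}} constant-pool label
; on X86 vs. a RIP-relative broadcast load on X64).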
define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

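; The rounded fmaddsub/fmsubadd tests call the target-specific
; @llvm.x86.avx512.vfmaddsub.* intrinsics; the trailing i32 8 operand is the
; rounding control, which here appears to encode round-to-nearest with
; suppress-all-exceptions, printed in the checks as {rn-sae}.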
define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %3
}

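; Without a rounding argument, fmaddsub is written out as two @llvm.fma calls
; (one on %__C and one on its negation) whose results are interleaved by a
; shufflevector: even lanes take the subtract result, odd lanes the add.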
define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
  ret <8 x double> %5
}

define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
  ret <8 x double> %5
}

define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
  ret <8 x double> %5
}

define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
  ret <8 x double> %4
}

define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
  ret <8 x double> %4
}

define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsubadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
  ret <16 x float> %5
}

define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
  ret <16 x float> %5
}

define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
  %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
  ret <16 x float> %5
}

define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
  ret <16 x float> %4
}

define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %4
}

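; The mask3 tests below keep the accumulator operand (%__C) as the destination
; under the write-mask, so codegen selects the 231 instruction form and then
; copies the result back to zmm0 with vmovaps/vmovapd.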
4526define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4527; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
4528; X86:       # %bb.0: # %entry
4529; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4530; X86-NEXT:    kmovw %eax, %k1
4531; X86-NEXT:    vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4532; X86-NEXT:    vmovapd %zmm2, %zmm0
4533; X86-NEXT:    retl
4534;
4535; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
4536; X64:       # %bb.0: # %entry
4537; X64-NEXT:    kmovw %edi, %k1
4538; X64-NEXT:    vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4539; X64-NEXT:    vmovapd %zmm2, %zmm0
4540; X64-NEXT:    retq
4541entry:
4542  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4543  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4544  %1 = bitcast i8 %__U to <8 x i1>
4545  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4546  ret <8 x double> %2
4547}
4548
4549define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4550; X86-LABEL: test_mm512_mask3_fmsub_pd:
4551; X86:       # %bb.0: # %entry
4552; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4553; X86-NEXT:    kmovw %eax, %k1
4554; X86-NEXT:    vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4555; X86-NEXT:    vmovapd %zmm2, %zmm0
4556; X86-NEXT:    retl
4557;
4558; X64-LABEL: test_mm512_mask3_fmsub_pd:
4559; X64:       # %bb.0: # %entry
4560; X64-NEXT:    kmovw %edi, %k1
4561; X64-NEXT:    vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4562; X64-NEXT:    vmovapd %zmm2, %zmm0
4563; X64-NEXT:    retq
4564entry:
4565  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4566  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4567  %1 = bitcast i8 %__U to <8 x i1>
4568  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4569  ret <8 x double> %2
4570}
4571
4572define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4573; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
4574; X86:       # %bb.0: # %entry
4575; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
4576; X86-NEXT:    kmovw %eax, %k1
4577; X86-NEXT:    vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4578; X86-NEXT:    vmovaps %zmm2, %zmm0
4579; X86-NEXT:    retl
4580;
4581; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
4582; X64:       # %bb.0: # %entry
4583; X64-NEXT:    kmovw %edi, %k1
4584; X64-NEXT:    vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4585; X64-NEXT:    vmovaps %zmm2, %zmm0
4586; X64-NEXT:    retq
4587entry:
4588  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4589  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4590  %1 = bitcast i16 %__U to <16 x i1>
4591  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4592  ret <16 x float> %2
4593}
4594
4595define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4596; X86-LABEL: test_mm512_mask3_fmsub_ps:
4597; X86:       # %bb.0: # %entry
4598; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
4599; X86-NEXT:    kmovw %eax, %k1
4600; X86-NEXT:    vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4601; X86-NEXT:    vmovaps %zmm2, %zmm0
4602; X86-NEXT:    retl
4603;
4604; X64-LABEL: test_mm512_mask3_fmsub_ps:
4605; X64:       # %bb.0: # %entry
4606; X64-NEXT:    kmovw %edi, %k1
4607; X64-NEXT:    vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4608; X64-NEXT:    vmovaps %zmm2, %zmm0
4609; X64-NEXT:    retq
4610entry:
4611  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4612  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4613  %1 = bitcast i16 %__U to <16 x i1>
4614  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4615  ret <16 x float> %2
4616}
4617
4618define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4619; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4620; X86:       # %bb.0: # %entry
4621; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
4622; X86-NEXT:    kmovw %eax, %k1
4623; X86-NEXT:    vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4624; X86-NEXT:    vmovapd %zmm2, %zmm0
4625; X86-NEXT:    retl
4626;
4627; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4628; X64:       # %bb.0: # %entry
4629; X64-NEXT:    kmovw %edi, %k1
4630; X64-NEXT:    vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4631; X64-NEXT:    vmovapd %zmm2, %zmm0
4632; X64-NEXT:    retq
4633entry:
4634  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4635  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4636  %1 = bitcast i8 %__U to <8 x i1>
4637  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4638  ret <8 x double> %2
4639}
4640
define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
  ret <8 x double> %4
}

define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
  ret <16 x float> %4
}

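; The fnmadd tests negate the first multiplicand in IR (fsub -0.0, %__A); the
; negation should be folded into vfnmadd132pd/vfnmadd132ps rather than being
; emitted as a separate xor of the sign bits.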
define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

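; The fnmsub tests negate both the second multiplicand and the addend, giving
; -(a * b) - c; the expected selection is a single masked vfnmsub132/vfnmsub231
; with no standalone negations.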
define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

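; Scalar single-precision tests: the operation is performed on element 0 only,
; and the mask is reduced to bit 0 (and i8 %__U, 1) before the select, so the
; expected lowering is still a masked vfmadd213ss with the upper lanes of xmm0
; passed through unchanged.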
define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

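; The *_round_ss variants go through the llvm.x86.avx512.vfmadd.f32 intrinsic;
; the i32 8 rounding argument is round-to-nearest with suppress-all-exceptions,
; printed as {rn-sae} in the checked assembly.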
define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %0
  %7 = insertelement <4 x float> %__W, float %6, i64 0
  ret <4 x float> %7
}

declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1

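; maskz variants select 0.0 instead of a passthrough element when the mask bit
; is clear, which maps to the zero-masking form {%k1} {z} of the instruction.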
define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float 0.000000e+00
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

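; mask3 variants merge into the addend operand (%__Y) rather than into the
; destination, hence the 231 instruction form followed by a vmovaps of xmm2
; into the return register.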
define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__Y, float %6, i64 0
  ret <4 x float> %7
}

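; The scalar fmsub tests negate only the extracted addend before the fma call,
; which should select vfmsub213ss/vfmsub231ss.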
define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %.rhs.i = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__A, i64 0
  %.rhs = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %0
  %7 = insertelement <4 x float> %__W, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %.rhs.i = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %.rhs = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float 0.000000e+00
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %.rhs.i = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %1 = extractelement <4 x float> %__X, i64 0
  %.rhs = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %.rhs
  %7 = insertelement <4 x float> %__Y, float %6, i64 0
  ret <4 x float> %7
}

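; The scalar fnmadd tests negate the extracted multiplicand instead, which
; should select vfnmadd213ss/vfnmadd231ss.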
define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %2 = extractelement <4 x float> %__B, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %0
  %7 = insertelement <4 x float> %__W, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs.i = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %2 = extractelement <4 x float> %__C, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float 0.000000e+00
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %2 = extractelement <4 x float> %__Y, i64 0
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__Y, float %6, i64 0
  ret <4 x float> %7
}

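; The scalar fnmsub tests negate both the multiplicand and the addend; note
; that the mask3 round variant passes the original addend (%.rhs1, before the
; negation) through when the mask bit is clear.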
define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs7.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
  %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs = extractelement <4 x float> %__A, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %.rhs2 = extractelement <4 x float> %__B, i64 0
  %2 = fsub float -0.000000e+00, %.rhs2
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %0
  %7 = insertelement <4 x float> %__W, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs.i = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %.rhs5.i = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs5.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %.rhs = extractelement <4 x float> %__B, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %.rhs2 = extractelement <4 x float> %__C, i64 0
  %2 = fsub float -0.000000e+00, %.rhs2
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float 0.000000e+00
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs.i = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs7.i
  %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <4 x float> %__Y, i32 0
  %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
  %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovaps %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovaps %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__W, i64 0
  %.rhs = extractelement <4 x float> %__X, i64 0
  %1 = fsub float -0.000000e+00, %.rhs
  %.rhs1 = extractelement <4 x float> %__Y, i64 0
  %2 = fsub float -0.000000e+00, %.rhs1
  %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %.rhs1
  %7 = insertelement <4 x float> %__Y, float %6, i64 0
  ret <4 x float> %7
}

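; The same matrix of tests repeats for scalar double precision, using
; llvm.fma.f64 / llvm.x86.avx512.vfmadd.f64 and the sd instruction forms on
; element 0 of <2 x double> vectors.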
define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__A, i64 0
  %2 = extractelement <2 x double> %__B, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__A, i64 0
  %2 = extractelement <2 x double> %__B, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %0
  %7 = insertelement <2 x double> %__W, double %6, i64 0
  ret <2 x double> %7
}

declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1

define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__C, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__C, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double 0.000000e+00
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %2 = extractelement <2 x double> %__Y, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %2 = extractelement <2 x double> %__Y, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %2
  %7 = insertelement <2 x double> %__Y, double %6, i64 0
  ret <2 x double> %7
}

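; Scalar double fmsub: as in the ss tests, only the addend is negated, and the
; mask3 round variant merges the pre-negation addend (%.rhs) when the mask bit
; is clear.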
define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__A, i64 0
  %.rhs.i = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__A, i64 0
  %.rhs = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %0
  %7 = insertelement <2 x double> %__W, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %.rhs.i = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %.rhs = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double 0.000000e+00
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %.rhs.i = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %1 = extractelement <2 x double> %__X, i64 0
  %.rhs = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %.rhs
  %7 = insertelement <2 x double> %__Y, double %6, i64 0
  ret <2 x double> %7
}

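; The FNMADD tests negate the multiplier instead of the addend, so lane 0
; computes -(a * b) + c before the same mask-driven select.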
define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %2 = extractelement <2 x double> %__B, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %2 = extractelement <2 x double> %__B, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %0
  %7 = insertelement <2 x double> %__W, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs.i = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %2 = extractelement <2 x double> %__C, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %2 = extractelement <2 x double> %__C, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double 0.000000e+00
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__X, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %2 = extractelement <2 x double> %__Y, i64 0
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext1.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs = extractelement <2 x double> %__X, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %2 = extractelement <2 x double> %__Y, i64 0
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %2
  %7 = insertelement <2 x double> %__Y, double %6, i64 0
  ret <2 x double> %7
}

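; The FNMSUB tests negate both the multiplier and the addend, so lane 0
; computes -(a * b) - c.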
define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs7.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
  %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_fnmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs = extractelement <2 x double> %__A, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %.rhs2 = extractelement <2 x double> %__B, i64 0
  %2 = fsub double -0.000000e+00, %.rhs2
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %0
  %7 = insertelement <2 x double> %__W, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs.i = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %.rhs5.i = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs5.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %.rhs = extractelement <2 x double> %__B, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %.rhs2 = extractelement <2 x double> %__C, i64 0
  %2 = fsub double -0.000000e+00, %.rhs2
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double 0.000000e+00
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs.i = extractelement <2 x double> %__X, i64 0
  %1 = fsub double -0.000000e+00, %.rhs.i
  %.rhs7.i = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs7.i
  %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
  %4 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %4, 0
  %vecext2.i = extractelement <2 x double> %__Y, i32 0
  %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
  %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT:    vmovapd %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT:    vmovapd %xmm2, %xmm0
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__W, i64 0
  %.rhs = extractelement <2 x double> %__X, i64 0
  %1 = fsub double -0.000000e+00, %.rhs
  %.rhs1 = extractelement <2 x double> %__Y, i64 0
  %2 = fsub double -0.000000e+00, %.rhs1
  %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %.rhs1
  %7 = insertelement <2 x double> %__Y, double %6, i64 0
  ret <2 x double> %7
}

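; The expand-load tests lower to @llvm.masked.expandload.* with the integer
; mask bitcast to a vector of i1; the maskz variants pass a zeroinitializer
; passthru instead of %__W.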
define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandq (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_expandloadu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq (%rsi), %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandq (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandq (%rsi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
  ret <8 x i64> %2
}

define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandpd (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_expandloadu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd (%rsi), %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandpd (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandpd (%rsi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
  ret <8 x double> %2
}

define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandd (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_expandloadu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd (%rsi), %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i8* %__P to i32*
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpexpandd (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpexpandd (%rsi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i32*
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandps (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_expandloadu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps (%rsi), %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vexpandps (%eax), %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vexpandps (%rsi), %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
  ret <16 x float> %2
}

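; The compress-store tests lower to @llvm.masked.compressstore.*, packing the
; mask-selected elements contiguously at %__P.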
define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcompresspd %zmm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to double*
  %1 = bitcast i8 %__U to <8 x i1>
  tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
  ret void
}

define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpcompressq %zmm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to i64*
  %1 = bitcast i8 %__U to <8 x i1>
  tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
  ret void
}

define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcompressps %zmm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8* %__P to float*
  %1 = bitcast i16 %__U to <16 x i1>
  tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
  ret void
}

define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpcompressd %zmm0, (%ecx) {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %esi, %k1
; X64-NEXT:    vpcompressd %zmm0, (%rdi) {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast i8* %__P to i32*
  %2 = bitcast i16 %__U to <16 x i1>
  tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
  ret void
}

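; The reduction tests repeatedly halve the vector with shufflevector and
; combine the halves with the scalar op until one element remains. On X86 the
; i64 result is returned in edx:eax (vmovd + vpextrd), on X64 in rax (vmovq).
; Masked variants first select against the op's identity: zero for add/or,
; one for mul, all-ones for and.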
define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_add_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_add_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %add7.i = add <2 x i64> %shuffle6.i, %add4.i
  %vecext.i = extractelement <2 x i64> %add7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_mul_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpsrlq $32, %ymm0, %ymm2
; X86-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; X86-NEXT:    vpsrlq $32, %ymm1, %ymm3
; X86-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; X86-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT:    vpsllq $32, %ymm2, %ymm2
; X86-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X86-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X86-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_mul_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpsrlq $32, %ymm0, %ymm2
; X64-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
; X64-NEXT:    vpsrlq $32, %ymm1, %ymm3
; X64-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
; X64-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT:    vpsllq $32, %ymm2, %ymm2
; X64-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X64-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; X64-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
  %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_or_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_or_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %or7.i = or <2 x i64> %shuffle6.i, %or4.i
  %vecext.i = extractelement <2 x i64> %or7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_and_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpand %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_and_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %and7.i = and <2 x i64> %shuffle6.i, %and4.i
  %vecext.i = extractelement <2 x i64> %and7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %add7.i = add <2 x i64> %shuffle6.i, %add4.i
  %vecext.i = extractelement <2 x i64> %add7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpsrlq $32, %ymm1, %ymm2
; X86-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
; X86-NEXT:    vpsrlq $32, %ymm0, %ymm3
; X86-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; X86-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT:    vpsllq $32, %ymm2, %ymm2
; X86-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
; X86-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X86-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpsrlq $32, %ymm1, %ymm2
; X64-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
; X64-NEXT:    vpsrlq $32, %ymm0, %ymm3
; X64-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; X64-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT:    vpsllq $32, %ymm2, %ymm2
; X64-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X64-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm3
; X64-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; X64-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
  %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %and7.i = and <2 x i64> %shuffle6.i, %and4.i
  %vecext.i = extractelement <2 x i64> %and7.i, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %or7.i = or <2 x i64> %shuffle6.i, %or4.i
  %vecext.i = extractelement <2 x i64> %or7.i, i32 0
  ret i64 %vecext.i
}

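; The unmasked epi32 reductions generate identical code on X86 and X64 apart
; from the return instruction, so they share the common CHECK prefix; the
; masked variants again split by how the mask reaches %k1.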
6966define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
6967; CHECK-LABEL: test_mm512_reduce_add_epi32:
6968; CHECK:       # %bb.0: # %entry
6969; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
6970; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
6971; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
6972; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
6973; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6974; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
6975; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
6976; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
6977; CHECK-NEXT:    vmovd %xmm0, %eax
6978; CHECK-NEXT:    vzeroupper
6979; CHECK-NEXT:    ret{{[l|q]}}
6980entry:
6981  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6982  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
6983  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6984  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
6985  %add.i = add <8 x i32> %0, %1
6986  %2 = bitcast <8 x i32> %add.i to <4 x i64>
6987  %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6988  %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
6989  %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6990  %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
6991  %add5.i = add <4 x i32> %3, %4
6992  %shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
6993  %add6.i = add <4 x i32> %shuffle.i, %add5.i
6994  %shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
6995  %add8.i = add <4 x i32> %shuffle7.i, %add6.i
6996  %vecext.i = extractelement <4 x i32> %add8.i, i32 0
6997  ret i32 %vecext.i
6998}
6999
7000define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
7001; CHECK-LABEL: test_mm512_reduce_mul_epi32:
7002; CHECK:       # %bb.0: # %entry
7003; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
7004; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
7005; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
7006; CHECK-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
7007; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7008; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
7009; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
7010; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
7011; CHECK-NEXT:    vmovd %xmm0, %eax
7012; CHECK-NEXT:    vzeroupper
7013; CHECK-NEXT:    ret{{[l|q]}}
7014entry:
7015  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7016  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
7017  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7018  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
7019  %mul.i = mul <8 x i32> %0, %1
7020  %2 = bitcast <8 x i32> %mul.i to <4 x i64>
7021  %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7022  %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
7023  %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7024  %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
7025  %mul5.i = mul <4 x i32> %3, %4
7026  %shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7027  %mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
7028  %shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7029  %mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
7030  %vecext.i = extractelement <4 x i32> %mul8.i, i32 0
7031  ret i32 %vecext.i
7032}
7033
7034define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
7035; CHECK-LABEL: test_mm512_reduce_or_epi32:
7036; CHECK:       # %bb.0: # %entry
7037; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
7038; CHECK-NEXT:    vpor %ymm1, %ymm0, %ymm0
7039; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
7040; CHECK-NEXT:    vpor %xmm1, %xmm0, %xmm0
7041; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7042; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
7043; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
7044; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
7045; CHECK-NEXT:    vmovd %xmm0, %eax
7046; CHECK-NEXT:    vzeroupper
7047; CHECK-NEXT:    ret{{[l|q]}}
7048entry:
7049  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7050  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7051  %or25.i = or <4 x i64> %extract.i, %extract2.i
7052  %extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7053  %extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7054  %or526.i = or <2 x i64> %extract3.i, %extract4.i
7055  %or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
7056  %shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7057  %or6.i = or <4 x i32> %shuffle.i, %or5.i
7058  %shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7059  %or8.i = or <4 x i32> %shuffle7.i, %or6.i
7060  %vecext.i = extractelement <4 x i32> %or8.i, i32 0
7061  ret i32 %vecext.i
7062}
7063
7064define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
7065; CHECK-LABEL: test_mm512_reduce_and_epi32:
7066; CHECK:       # %bb.0: # %entry
7067; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
7068; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
7069; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
7070; CHECK-NEXT:    vpand %xmm1, %xmm0, %xmm0
7071; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7072; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
7073; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
7074; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
7075; CHECK-NEXT:    vmovd %xmm0, %eax
7076; CHECK-NEXT:    vzeroupper
7077; CHECK-NEXT:    ret{{[l|q]}}
7078entry:
7079  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7080  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7081  %and25.i = and <4 x i64> %extract.i, %extract2.i
7082  %extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7083  %extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7084  %and526.i = and <2 x i64> %extract3.i, %extract4.i
7085  %and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
7086  %shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7087  %and6.i = and <4 x i32> %shuffle.i, %and5.i
7088  %shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7089  %and8.i = and <4 x i32> %shuffle7.i, %and6.i
7090  %vecext.i = extractelement <4 x i32> %and8.i, i32 0
7091  ret i32 %vecext.i
7092}
7093
define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
  %add.i = add <8 x i32> %4, %5
  %6 = bitcast <8 x i32> %add.i to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %7 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %8 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %add6.i = add <4 x i32> %7, %8
  %shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %add7.i = add <4 x i32> %shuffle.i, %add6.i
  %shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %add9.i = add <4 x i32> %shuffle8.i, %add7.i
  %vecext.i = extractelement <4 x i32> %add9.i, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %mul.i = mul <8 x i32> %4, %5
  %6 = bitcast <8 x i32> %mul.i to <4 x i64>
  %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %7 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %8 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %mul7.i = mul <4 x i32> %7, %8
  %shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
  %shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
  %vecext.i = extractelement <4 x i32> %mul10.i, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and28.i = and <4 x i64> %extract.i, %extract4.i
  %extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and729.i = and <2 x i64> %extract5.i, %extract6.i
  %and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %and8.i = and <4 x i32> %shuffle.i, %and7.i
  %shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %and10.i = and <4 x i32> %shuffle9.i, %and8.i
  %vecext.i = extractelement <4 x i32> %and10.i, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or27.i = or <4 x i64> %extract.i, %extract3.i
  %extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or628.i = or <2 x i64> %extract4.i, %extract5.i
  %or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %or7.i = or <4 x i32> %shuffle.i, %or6.i
  %shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %or9.i = or <4 x i32> %shuffle8.i, %or7.i
  %vecext.i = extractelement <4 x i32> %or9.i, i32 0
  ret i32 %vecext.i
}

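; FP reductions: X64 returns the scalar directly in %xmm0, while X86 spills it
; to the stack and reloads with fldl/flds to satisfy the x87 return convention.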
define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_add_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_add_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %add7.i, i32 0
  ret double %vecext.i
}

define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_mul_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_mul_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %mul7.i, i32 0
  ret double %vecext.i
}

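; The ps reductions extract the two 256-bit halves through <8 x double>
; bitcasts before performing the <8 x float> fadd/fmul.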
define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_add_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_add_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %add.i = fadd <8 x float> %1, %2
  %extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add5.i = fadd <4 x float> %extract3.i, %extract4.i
  %shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %add6.i = fadd <4 x float> %add5.i, %shuffle.i
  %shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %add8.i = fadd <4 x float> %add6.i, %shuffle7.i
  %vecext.i = extractelement <4 x float> %add8.i, i32 0
  ret float %vecext.i
}

define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_mul_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_mul_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %mul.i = fmul <8 x float> %1, %2
  %extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul5.i = fmul <4 x float> %extract3.i, %extract4.i
  %shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
  %shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
  %vecext.i = extractelement <4 x float> %mul8.i, i32 0
  ret float %vecext.i
}

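; Masked FP reductions: a masked move substitutes the identity (0.0 for add,
; 1.0 for mul) into inactive lanes before the same reduction sequence.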
define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_add_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
  %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %add7.i, i32 0
  ret double %vecext.i
}

define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
  %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %mul7.i, i32 0
  ret double %vecext.i
}

define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_add_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract3.i to <8 x float>
  %add.i = fadd <8 x float> %3, %4
  %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add6.i = fadd <4 x float> %extract4.i, %extract5.i
  %shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %add7.i = fadd <4 x float> %add6.i, %shuffle.i
  %shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %add9.i = fadd <4 x float> %add7.i, %shuffle8.i
  %vecext.i = extractelement <4 x float> %add9.i, i32 0
  ret float %vecext.i
}

define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %mul.i = fmul <8 x float> %3, %4
  %extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul7.i = fmul <4 x float> %extract5.i, %extract6.i
  %shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
  %shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
  %vecext.i = extractelement <4 x float> %mul10.i, i32 0
  ret float %vecext.i
}

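; 64-bit min/max reductions: with only +avx512f (no VL), the 256/128-bit
; vpmax*q/vpmin*q steps are widened to zmm operands; on X86 the i64 result is
; returned in edx:eax via vmovd/vpextrd.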
define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_max_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %0 = icmp slt <8 x i64> %shuffle.i, %__W
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %2 = icmp sgt <8 x i64> %1, %shuffle1.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %4 = icmp sgt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %vecext.i = extractelement <8 x i64> %5, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_max_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpmaxuq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %0 = icmp ult <8 x i64> %shuffle.i, %__W
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %2 = icmp ugt <8 x i64> %1, %shuffle1.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %4 = icmp ugt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %vecext.i = extractelement <8 x i64> %5, i32 0
  ret i64 %vecext.i
}

define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
  %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
  %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %2, i32 0
  ret double %vecext.i
}

define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_min_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %0 = icmp sgt <8 x i64> %shuffle.i, %__W
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %2 = icmp slt <8 x i64> %1, %shuffle1.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %4 = icmp slt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %vecext.i = extractelement <8 x i64> %5, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_min_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %0 = icmp ugt <8 x i64> %shuffle.i, %__W
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %2 = icmp ult <8 x i64> %1, %shuffle1.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %4 = icmp ult <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %vecext.i = extractelement <8 x i64> %5, i32 0
  ret i64 %vecext.i
}

define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
  %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
  %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %2, i32 0
  ret double %vecext.i
}

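; Masked min/max reductions seed inactive lanes with the identity: INT64_MIN
; for signed max, 0 for unsigned max, INT64_MAX for signed min, all-ones for
; unsigned min, and -Inf/+Inf for the FP max/min cases.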
7988define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
7989; X86-LABEL: test_mm512_mask_reduce_max_epi64:
7990; X86:       # %bb.0: # %entry
7991; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
7992; X86-NEXT:    kmovw %eax, %k1
7993; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
7994; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
7995; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
7996; X86-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
7997; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
7998; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
7999; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8000; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
8001; X86-NEXT:    vmovd %xmm0, %eax
8002; X86-NEXT:    vpextrd $1, %xmm0, %edx
8003; X86-NEXT:    vzeroupper
8004; X86-NEXT:    retl
8005;
8006; X64-LABEL: test_mm512_mask_reduce_max_epi64:
8007; X64:       # %bb.0: # %entry
8008; X64-NEXT:    kmovw %edi, %k1
8009; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
8010; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
8011; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
8012; X64-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
8013; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
8014; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
8015; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8016; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
8017; X64-NEXT:    vmovq %xmm0, %rax
8018; X64-NEXT:    vzeroupper
8019; X64-NEXT:    retq
8020entry:
8021  %0 = bitcast i8 %__M to <8 x i1>
8022  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
8023  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
8024  %2 = icmp sgt <8 x i64> %1, %shuffle.i
8025  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
8026  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
8027  %4 = icmp sgt <8 x i64> %3, %shuffle3.i
8028  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
8029  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
8030  %6 = icmp sgt <8 x i64> %5, %shuffle5.i
8031  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
8032  %vecext.i = extractelement <8 x i64> %7, i32 0
8033  ret i64 %vecext.i
8034}
8035
8036define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
8037; X86-LABEL: test_mm512_mask_reduce_max_epu64:
8038; X86:       # %bb.0: # %entry
8039; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
8040; X86-NEXT:    kmovw %eax, %k1
8041; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
8042; X86-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
8043; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
8044; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
8045; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
8046; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8047; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
8048; X86-NEXT:    vmovd %xmm0, %eax
8049; X86-NEXT:    vpextrd $1, %xmm0, %edx
8050; X86-NEXT:    vzeroupper
8051; X86-NEXT:    retl
8052;
8053; X64-LABEL: test_mm512_mask_reduce_max_epu64:
8054; X64:       # %bb.0: # %entry
8055; X64-NEXT:    kmovw %edi, %k1
8056; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
8057; X64-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
8058; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
8059; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
8060; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
8061; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8062; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
8063; X64-NEXT:    vmovq %xmm0, %rax
8064; X64-NEXT:    vzeroupper
8065; X64-NEXT:    retq
8066entry:
8067  %0 = bitcast i8 %__M to <8 x i1>
8068  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
8069  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
8070  %2 = icmp ugt <8 x i64> %1, %shuffle.i
8071  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
8072  %shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
8073  %4 = icmp ugt <8 x i64> %3, %shuffle2.i
8074  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
8075  %shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
8076  %6 = icmp ugt <8 x i64> %5, %shuffle4.i
8077  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
8078  %vecext.i = extractelement <8 x i64> %7, i32 0
8079  ret i64 %vecext.i
8080}
8081
8082define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
8083; X86-LABEL: test_mm512_mask_reduce_max_pd:
8084; X86:       # %bb.0: # %entry
8085; X86-NEXT:    pushl %ebp
8086; X86-NEXT:    .cfi_def_cfa_offset 8
8087; X86-NEXT:    .cfi_offset %ebp, -8
8088; X86-NEXT:    movl %esp, %ebp
8089; X86-NEXT:    .cfi_def_cfa_register %ebp
8090; X86-NEXT:    andl $-8, %esp
8091; X86-NEXT:    subl $8, %esp
8092; X86-NEXT:    movb 8(%ebp), %al
8093; X86-NEXT:    kmovw %eax, %k1
8094; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
8095; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
8096; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
8097; X86-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
8098; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
8099; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
8100; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8101; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
8102; X86-NEXT:    vmovsd %xmm0, (%esp)
8103; X86-NEXT:    fldl (%esp)
8104; X86-NEXT:    movl %ebp, %esp
8105; X86-NEXT:    popl %ebp
8106; X86-NEXT:    .cfi_def_cfa %esp, 4
8107; X86-NEXT:    vzeroupper
8108; X86-NEXT:    retl
8109;
8110; X64-LABEL: test_mm512_mask_reduce_max_pd:
8111; X64:       # %bb.0: # %entry
8112; X64-NEXT:    kmovw %edi, %k1
8113; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
8114; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
8115; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
8116; X64-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
8117; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
8118; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
8119; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8120; X64-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
8121; X64-NEXT:    vzeroupper
8122; X64-NEXT:    retq
8123entry:
8124  %0 = bitcast i8 %__M to <8 x i1>
8125  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
8126  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8127  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8128  %2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
8129  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
8130  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
8131  %3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
8132  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
8133  %4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
8134  %vecext.i = extractelement <2 x double> %4, i32 0
8135  ret double %vecext.i
8136}
8137
8138define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
8139; X86-LABEL: test_mm512_mask_reduce_min_epi64:
8140; X86:       # %bb.0: # %entry
8141; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
8142; X86-NEXT:    kmovw %eax, %k1
8143; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
8144; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
8145; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
8146; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
8147; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
8148; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
8149; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8150; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
8151; X86-NEXT:    vmovd %xmm0, %eax
8152; X86-NEXT:    vpextrd $1, %xmm0, %edx
8153; X86-NEXT:    vzeroupper
8154; X86-NEXT:    retl
8155;
8156; X64-LABEL: test_mm512_mask_reduce_min_epi64:
8157; X64:       # %bb.0: # %entry
8158; X64-NEXT:    kmovw %edi, %k1
8159; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
8160; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
8161; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
8162; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
8163; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
8164; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
8165; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8166; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
8167; X64-NEXT:    vmovq %xmm0, %rax
8168; X64-NEXT:    vzeroupper
8169; X64-NEXT:    retq
8170entry:
8171  %0 = bitcast i8 %__M to <8 x i1>
8172  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
8173  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
8174  %2 = icmp slt <8 x i64> %1, %shuffle.i
8175  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
8176  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
8177  %4 = icmp slt <8 x i64> %3, %shuffle3.i
8178  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
8179  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
8180  %6 = icmp slt <8 x i64> %5, %shuffle5.i
8181  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
8182  %vecext.i = extractelement <8 x i64> %7, i32 0
8183  ret i64 %vecext.i
8184}
8185
define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ult <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ult <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ult <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

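; NOTE: The floating-point min reduction below seeds inactive lanes with +Inf
; (0x7FF0000000000000), the identity for min, and builds the same halving tree
; out of llvm.x86.avx.min.pd.256 / llvm.x86.sse2.min.pd, which lower to vminpd
; plus a final scalar vminsd.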
define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vminpd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vminpd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

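; NOTE: The next six tests are the unmasked 32-bit integer and float
; reductions. Each splits the 512-bit vector in half twice (512 -> 256 -> 128)
; and then performs two in-register lane shuffles, so reducing 16 lanes takes
; log2(16) = 4 min/max steps.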
define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp sgt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp sgt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp sgt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp sgt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ugt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ugt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ugt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ugt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}

define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp slt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp slt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp slt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp slt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ult <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ult <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ult <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ult <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT:    vminss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    vminss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}

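; NOTE: The masked variants below differ from the unmasked reductions only in
; the initial select that seeds inactive lanes with the identity of the
; operation: INT32_MIN for signed max, zero for unsigned max (which is why a
; zeroing masked move suffices there), -Inf for float max, INT32_MAX for
; signed min, all-ones for unsigned min (materialized with vpternlogd $255),
; and +Inf for float min.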
define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp sgt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp sgt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp sgt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp sgt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
  %6 = icmp ugt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %11 = icmp ugt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ugt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ugt <4 x i32> %14, %shuffle9.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmaxps %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmaxps %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
  %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
  %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
  %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
  %vecext.i = extractelement <4 x float> %8, i32 0
  ret float %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpminsd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpminsd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp slt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp slt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp slt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp slt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpminud %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpminud %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp ult <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp ult <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ult <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ult <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vminps %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT:    vminss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vminps %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT:    vminss %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
  %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
  %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
  %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
  %vecext.i = extractelement <4 x float> %8, i32 0
  ret float %vecext.i
}

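; NOTE: The min/max tests that follow exercise the 512-bit vmaxpd/vmaxps/
; vminpd/vminps intrinsics. The rounding argument i32 4 is
; _MM_FROUND_CUR_DIRECTION, so the "round" variants lower to the same
; instruction as the plain ones; the mask and maskz forms are expressed as a
; select over the intrinsic result.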
define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_max_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  ret <8 x double> %0
}

define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_max_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_max_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_max_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  ret <16 x float> %0
}

define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_min_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  ret <8 x double> %0
}

define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_min_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_min_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_min_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  ret <16 x float> %0
}

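; NOTE: The sqrt tests below use the generic llvm.sqrt.* intrinsics for the
; default-rounding forms, and llvm.x86.avx512.sqrt.{pd,ps}.512 with rounding
; argument i32 8 (round-to-nearest with exceptions suppressed, printed as
; {rn-sae}) for the explicitly rounded forms.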
9278define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
9279; CHECK-LABEL: test_mm512_sqrt_pd:
9280; CHECK:       # %bb.0: # %entry
9281; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
9282; CHECK-NEXT:    ret{{[l|q]}}
9283entry:
9284  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
9285  ret <8 x double> %0
9286}
9287
9288define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
9289; X86-LABEL: test_mm512_mask_sqrt_pd:
9290; X86:       # %bb.0: # %entry
9291; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
9292; X86-NEXT:    kmovw %eax, %k1
9293; X86-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
9294; X86-NEXT:    retl
9295;
9296; X64-LABEL: test_mm512_mask_sqrt_pd:
9297; X64:       # %bb.0: # %entry
9298; X64-NEXT:    kmovw %edi, %k1
9299; X64-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
9300; X64-NEXT:    retq
9301entry:
9302  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
9303  %1 = bitcast i8 %__U to <8 x i1>
9304  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
9305  ret <8 x double> %2
9306}
9307
9308define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
9309; X86-LABEL: test_mm512_maskz_sqrt_pd:
9310; X86:       # %bb.0: # %entry
9311; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
9312; X86-NEXT:    kmovw %eax, %k1
9313; X86-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
9314; X86-NEXT:    retl
9315;
9316; X64-LABEL: test_mm512_maskz_sqrt_pd:
9317; X64:       # %bb.0: # %entry
9318; X64-NEXT:    kmovw %edi, %k1
9319; X64-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
9320; X64-NEXT:    retq
9321entry:
9322  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
9323  %1 = bitcast i8 %__U to <8 x i1>
9324  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
9325  ret <8 x double> %2
9326}
9327
9328define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
9329; X86-LABEL: test_mm512_mask_sqrt_round_pd:
9330; X86:       # %bb.0: # %entry
9331; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
9332; X86-NEXT:    kmovw %eax, %k1
9333; X86-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
9334; X86-NEXT:    retl
9335;
9336; X64-LABEL: test_mm512_mask_sqrt_round_pd:
9337; X64:       # %bb.0: # %entry
9338; X64-NEXT:    kmovw %edi, %k1
9339; X64-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
9340; X64-NEXT:    retq
9341entry:
9342  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
9343  %1 = bitcast i8 %__U to <8 x i1>
9344  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
9345  ret <8 x double> %2
9346}
9347
9348declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)

define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  ret <8 x double> %0
}

define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
; CHECK-LABEL: test_mm512_sqrt_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)

define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  ret <16 x float> %0
}

define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_rol_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprold $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}
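
; NOTE: Constant rotates are expressed with the generic funnel-shift intrinsic:
; llvm.fshl with both value operands equal is a rotate left, e.g. for i32
; elements fshl(x, x, 5) == (x << 5) | (x >> 27), which is matched to
; vprold/vprolq below.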

define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprold $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rol_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprold $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <8 x i64> %__W to <16 x i32>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprold $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rol_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprold $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_rol_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolq $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}
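
; NOTE: The rolv tests use the same fshl idiom with a per-element count taken
; from %__B; the count is interpreted modulo the element width, matching the
; vprolvd/vprolvq semantics.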

define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}
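
; NOTE: Rotate right mirrors rotate left: llvm.fshr with equal value operands
; rotates each element right and is matched to vprord/vprorq (and
; vprorvd/vprorvq for the variable-count forms below).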

define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <8 x i64> %__W to <16 x i32>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)

declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
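
; NOTE: The generic llvm.sqrt.* and llvm.fsh* intrinsics above suffice for the
; default-rounding cases; the target-specific llvm.x86.avx512.sqrt.*.512
; intrinsics are only used where an explicit rounding-mode operand has to be
; carried through to the instruction.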

!0 = !{i32 1}