; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
; kortestz: returns 1 iff (%a0 | %a1) == 0 (the ZF result of kortestw).
define i32 @test_kortestz(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kortestz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k0
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    kortestw %k0, %k1
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    kmovw %eax, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
; kortestc: returns 1 iff (%a0 | %a1) is all-ones (the CF result of kortestw).
define i32 @test_kortestc(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kortestc:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k0
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    kortestw %k0, %k1
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1)
  ret i32 %res
}

declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
; Chained mask AND: computes (%a0 & 8) & %a1 through two kandw intrinsics.
define i16 @test_kand(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kand:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    kmovw %eax, %k0
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    kandw %k0, %k1, %k0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    kandw %k1, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
  %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
  ret i16 %t2
}

declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone
; Bitwise NOT of a 16-bit mask register.
define i16 @test_knot(i16 %a0) {
; CHECK-LABEL: test_knot:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k0
; CHECK-NEXT:    knotw %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone

; Interleaves the low bytes of the two 16-bit masks (kunpckbw).
define i16 @unpckbw_test(i16 %a0, i16 %a1) {
; CHECK-LABEL: unpckbw_test:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    kunpckbw %k1, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
  ret i16 %res
}

; Unmasked 14-bit-precision reciprocal, 512-bit packed single (mask = -1).
define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrcp14ps %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone

; Unmasked 14-bit-precision reciprocal, 512-bit packed double (mask = -1).
define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrcp14pd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone

declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)

; Unmasked rndscale, imm = 11, rounding arg 4 = CUR_DIRECTION.
define <8 x double> @test7(<8 x double> %a) {
; CHECK-LABEL: test7:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrndscalepd $11, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
  ret <8 x double>%res
}

declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)

; Unmasked rndscale (ps), imm = 11, rounding arg 4 = CUR_DIRECTION.
define <16 x float> @test8(<16 x float> %a) {
; CHECK-LABEL: test8:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrndscaleps $11, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
  ret <16 x float>%res
}

; Unmasked 14-bit-precision reciprocal sqrt, 512-bit packed single.
define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrsqrt14ps %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone

; Unmasked scalar 14-bit reciprocal sqrt (low element of xmm).
define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
; CHECK-LABEL: test_rsqrt14_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrsqrt14ss %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone

; Unmasked scalar 14-bit reciprocal (low element of xmm).
define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
; CHECK-LABEL: test_rcp14_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone

; Unmasked sqrt through the masked intrinsic (mask -1, rounding 4 = CUR_DIRECTION).
define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_sqrt_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 4)
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone

; Unmasked sqrt (ps) through the masked intrinsic (mask -1, rounding 4).
define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
; Static rounding: i32 3 selects {rz-sae} (round toward zero, suppress exceptions).
define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_round_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone

; Unmasked getexp (pd), current rounding direction (i32 4).
define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vgetexppd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 4)
  ret <8 x double> %res
}
; Rounding arg i32 8 selects {sae} (suppress-all-exceptions).
define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_round_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vgetexppd {sae}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0,  <8 x double> zeroinitializer, i8 -1, i32 8)
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone

; Unmasked getexp (ps), current rounding direction (i32 4).
define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vgetexpps %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}

; Rounding arg i32 8 selects {sae}.
define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_round_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vgetexpps {sae}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone

declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

; Masked scalar sqrt across rounding modes:
;   4 = CUR_DIRECTION, 1 = {rd-sae}, 2 = {ru-sae} (zero-masked), 3 = {rz-sae} (mask -1).
define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_sqrt_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
  %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)

  %res.1 = fadd <4 x float> %res0, %res1
  %res.2 = fadd <4 x float> %res2, %res3
  %res   = fadd <4 x float> %res.1, %res.2
  ret <4 x float> %res
}

declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

; Masked scalar sqrt (sd) across rounding modes:
;   4 = CUR_DIRECTION, 1 = {rd-sae}, 2 = {ru-sae} (zero-masked), 3 = {rz-sae} (mask -1).
define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_sqrt_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddpd %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
  %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)

  %res.1 = fadd <2 x double> %res0, %res1
  %res.2 = fadd <2 x double> %res2, %res3
  %res   = fadd <2 x double> %res.1, %res.2
  ret <2 x double> %res
}

; Scalar double -> i64 conversion (current rounding mode).
define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtsd2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsd2si %xmm0, %rax
; CHECK-NEXT:    retq
  %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
  ret i64 %res
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone

; i64 -> scalar double conversion into the low element.
define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
; CHECK-LABEL: test_x86_sse2_cvtsi642sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone

; Truncating double -> i64: current-direction (i32 4) and {sae} (i32 8) variants.
define i64 @test_x86_avx512_cvttsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttsd2si %xmm0, %rcx
; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %rax
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    retq
  %res0 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 4) ;
  %res1 = call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %a0, i32 8) ;
  %res2 = add i64 %res0, %res1
  ret i64 %res2
}
declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32) nounwind readnone

; Truncating double -> u32: current-direction (i32 4) and {sae} (i32 8) variants.
define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttsd2usi %xmm0, %ecx
; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    retq
  %res0 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ;
  %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone

; Truncating double -> i32: current-direction (i32 4) and {sae} (i32 8) variants.
define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttsd2si %xmm0, %ecx
; CHECK-NEXT:    vcvttsd2si {sae}, %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    retq
  %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ;
  %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone



; Truncating double -> u64: current-direction (i32 4) and {sae} (i32 8) variants.
define i64 @test_x86_avx512_cvttsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2usi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttsd2usi %xmm0, %rcx
; CHECK-NEXT:    vcvttsd2usi {sae}, %xmm0, %rax
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    retq
  %res0 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 4) ;
  %res1 = call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %a0, i32 8) ;
  %res2 = add i64 %res0, %res1
  ret i64 %res2
}
declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) nounwind readnone

; Scalar float -> i64 conversion (current rounding mode).
define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse_cvtss2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtss2si %xmm0, %rax
; CHECK-NEXT:    retq
  %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
  ret i64 %res
}
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone


; i64 -> scalar float conversion into the low element.
define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
; CHECK-LABEL: test_x86_sse_cvtsi642ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone


; Truncating float -> i32: {sae} (i32 8) then current-direction (i32 4).
define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %ecx
; CHECK-NEXT:    vcvttss2si %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    retq
  %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ;
  %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone

; Truncating float -> i64: current-direction (i32 4) and {sae} (i32 8) variants.
define i64 @test_x86_avx512_cvttss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2si64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttss2si %xmm0, %rcx
; CHECK-NEXT:    vcvttss2si {sae}, %xmm0, %rax
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    retq
  %res0 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 4) ;
  %res1 = call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %a0, i32 8) ;
  %res2 = add i64 %res0, %res1
  ret i64 %res2
}
declare i64 @llvm.x86.avx512.cvttss2si64(<4 x float>, i32) nounwind readnone

; Truncating float -> u32: {sae} (i32 8) then current-direction (i32 4).
define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2usi:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %ecx
; CHECK-NEXT:    vcvttss2usi %xmm0, %eax
; CHECK-NEXT:    addl %ecx, %eax
; CHECK-NEXT:    retq
  %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ;
  %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ;
  %res2 = add i32 %res0, %res1
  ret i32 %res2
}
declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone

; Truncating float -> u64: current-direction (i32 4) and {sae} (i32 8) variants.
define i64 @test_x86_avx512_cvttss2usi64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2usi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttss2usi %xmm0, %rcx
; CHECK-NEXT:    vcvttss2usi {sae}, %xmm0, %rax
; CHECK-NEXT:    addq %rcx, %rax
; CHECK-NEXT:    retq
  %res0 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 4) ;
  %res1 = call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %a0, i32 8) ;
  %res2 = add i64 %res0, %res1
  ret i64 %res2
}
declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone

; Scalar double -> u64 conversion (current rounding mode).
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsd2usi %xmm0, %rax
; CHECK-NEXT:    retq
  %res = call i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double> %a0) ; <i64> [#uses=1]
  ret i64 %res
}
declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone

; Unmasked half -> float conversion, 16 elements.
define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}

; Same conversion with {sae} (rounding arg i32 8).
define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtph2ps {sae}, %ymm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
  ret <16 x float> %res
}

; Merge-masked conversion: %a1 supplies the pass-through lanes.
define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4)
  ret <16 x float> %res
}

; Zero-masked {sae} conversion.
define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
  ret <16 x float> %res
}

; Zero-masked conversion with the default rounding argument (i32 4).
define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtph2ps %ymm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}

declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly


; Unmasked float -> half conversion, rounding imm = 2.
define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
; CHECK-LABEL: test_x86_vcvtps2ph_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
  ret <16 x i16> %res
}

declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly

; Broadcast a single float loaded from memory to all 16 lanes.
define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
; CHECK-LABEL: test_x86_vbroadcast_ss_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8*) nounwind readonly

; Broadcast a single double loaded from memory to all 8 lanes.
define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
; CHECK-LABEL: test_x86_vbroadcast_sd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly

; Broadcast the low float of a register operand to all 16 lanes.
define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0) {
; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float> %a0) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float>) nounwind readonly

; Broadcast the low double of a register operand to all 8 lanes.
define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0) {
; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double> %a0) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double>) nounwind readonly

; Exercises pbroadcastd in unmasked (-1), merge-masked, and zero-masked forms.
define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res2, %res3
  ret <16 x i32> %res4
}
declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)

; Broadcast a GPR i32 to all 16 lanes.
define <16 x i32> @test_x86_pbroadcastd_i32_512(i32  %a0) {
; CHECK-LABEL: test_x86_pbroadcastd_i32_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32 %a0) ; <<16 x i32>> [#uses=1]
  ret <16 x i32> %res
}
declare <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32) nounwind readonly

; Exercises pbroadcastq in unmasked (-1), merge-masked, and zero-masked forms.
define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}
declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)

; Broadcast a GPR i64 to all 8 lanes.
define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
; CHECK-LABEL: test_x86_pbroadcastq_i64_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64 %a0) ; <<8 x i64>> [#uses=1]
  ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly

; Unmasked vpconflictd (mask = -1).
define <16 x i32> @test_conflict_d(<16 x i32> %a) {
; CHECK-LABEL: test_conflict_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpconflictd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly

; Unmasked vpconflictq (mask = -1).
define <8 x i64> @test_conflict_q(<8 x i64> %a) {
; CHECK-LABEL: test_conflict_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpconflictq %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly

; Zero-masked vpconflictd.
define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
; CHECK-LABEL: test_maskz_conflict_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpconflictd %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

; Merge-masked vpconflictq: %b supplies the pass-through lanes.
define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_conflict_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpconflictq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret <8 x i64> %res
}

; Unmasked vplzcntd (mask = -1).
define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
; CHECK-LABEL: test_lzcnt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vplzcntd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly

; Unmasked vplzcntq (mask = -1).
define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
; CHECK-LABEL: test_lzcnt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vplzcntq %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly


; Merge-masked vplzcntd: %b supplies the pass-through lanes.
define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_lzcnt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vplzcntd %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret <16 x i32> %res
}

; Merge-masked vplzcntq: %b supplies the pass-through lanes.
define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_lzcnt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vplzcntq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret <8 x i64> %res
}

; Mask blend (ps): lanes set in %a0 take %a2, clear lanes take %a1.
define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_mask_blend_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vblendmps %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
  ret <16 x float> %res
}

declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly

; Mask blend (pd): lanes set in %a0 take %a2, clear lanes take %a1.
define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_mask_blend_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vblendmpd %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}

; Mask blend with a memory operand folded into vblendmpd.
define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
; CHECK-LABEL: test_x86_mask_blend_pd_512_memop:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vblendmpd (%rdi), %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %b = load <8 x double>, <8 x double>* %ptr
  %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a, <8 x double> %b, i8 %mask) ; <<8 x double>> [#uses=1]
  ret <8 x double> %res
}
declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double>, <8 x double>, i8) nounwind readonly

; Integer mask blend (d): lanes set in %a0 take %a2, clear lanes take %a1.
define <16 x i32> @test_x86_mask_blend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
; CHECK-LABEL: test_x86_mask_blend_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32> %a1, <16 x i32> %a2, i16 %a0) ; <<16 x i32>> [#uses=1]
  ret <16 x i32> %res
}
declare <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly

; Integer mask blend (q): lanes set in %a0 take %a2, clear lanes take %a1.
define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: test_x86_mask_blend_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
  ret <8 x i64> %res
}
declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly

; Float compare (predicate 2 = LE) with rounding control 8 ({sae}) and an
; all-ones mask; result comes back as an i16 bitmask via kmovw.
; NOTE: removed the stray one-column leading indentation these lines carried,
; to match the rest of the file; no token changed.
define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_cmpps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
  ret i16 %res
}
declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
755
; Double compare (predicate 4 = NEQ) with default rounding (4) and an
; all-ones mask; the i8 result bitmask is read out of %k0 with kmovw.
; NOTE: removed the stray one-column leading indentation these lines carried,
; to match the rest of the file; no token changed.
define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: test_cmppd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
  ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
766
; fp min - max
; Unmasked max (mask -1, zero passthru, default rounding 4) should lower to a
; plain vmaxpd with no {%k} qualifier.
define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vmaxpd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double>zeroinitializer, i8 -1, i32 4)
  ret <8 x double> %res
}
777declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>,
778                    <8 x double>, i8, i32)
779
; Unmasked min (mask -1, zero passthru, default rounding 4) should lower to a
; plain vminpd with no {%k} qualifier.
define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vminpd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double>zeroinitializer, i8 -1, i32 4)
  ret <8 x double> %res
}
789declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
790                    <8 x double>, i8, i32)
791
; Masked and unmasked absolute value of <16 x i32>: one vpabsd merging into
; %zmm1 under %k1, one plain vpabsd, results summed with vpaddd.
; NOTE: removed the stray leading space on the declare line to match the
; file's convention; no token changed.
declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpabsd %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpabsd %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
807
808declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
809
; Masked and unmasked absolute value of <8 x i64>: the i8 mask is zero-extended
; before the kmovw; one vpabsq merges into %zmm1 under %k1, one is unmasked,
; and the two results are summed with vpaddq.
define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpabsq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpabsq %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
824
; Unmasked ptestm on <8 x i64> (mask -1): vptestmq writes %k0, and the i8
; result bitmask is read out with kmovw.
define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_vptestmq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
  ret i8 %res
}
834declare i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
835
; Unmasked ptestm on <16 x i32> (mask -1): vptestmd writes %k0, and the i16
; result bitmask is read out with kmovw.
define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_vptestmd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
  ret i16 %res
}
845declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
846
; Masked unaligned store of <16 x float>: lowers to vmovups with a {%k1}
; write-mask qualifier on the memory destination.
define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_store1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovups %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  ret void
}
856
857declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
858
; Masked unaligned store of <8 x double>: lowers to vmovupd with a {%k1}
; write-mask qualifier on the memory destination.
define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_store2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovupd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  ret void
}
868
869declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
870
; Masked aligned store of <16 x float>: lowers to vmovaps (aligned form) with
; a {%k1} write-mask qualifier.
define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_store_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  ret void
}
880
881declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
882
; Masked aligned store of <8 x double>: lowers to vmovapd (aligned form) with
; a {%k1} write-mask qualifier.
define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_store_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  ret void
}
892
893declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
894
; Zero-masked aligned load of <16 x float> (zeroinitializer passthru): lowers
; to vmovaps with {%k1} {z} zeroing semantics.
define <16 x float> @test_maskz_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_maskz_load_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
  ret <16 x float> %res
}
904
905declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
906
; Zero-masked aligned load of <8 x double> (zeroinitializer passthru): lowers
; to vmovapd with {%k1} {z} zeroing semantics.
define <8 x double> @test_maskz_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_maskz_load_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
  ret <8 x double> %res
}
916
917declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
918
; All-ones mask (-1) on the masked load intrinsic should fold away to a plain
; unmasked vmovaps.
define <16 x float> @test_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_load_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovaps (%rdi), %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
  ret <16 x float> %res
}
927
; All-ones mask (-1) on the masked load intrinsic should fold away to a plain
; unmasked vmovapd.
define <8 x double> @test_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_load_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovapd (%rdi), %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
  ret <8 x double> %res
}
936
937declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
938
; Unmasked valign (mask -1, zero passthru) with shift count 2: lowers to a
; plain valignq $2 with no write mask.
define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_valign_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    valignq $2, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
947
; Merge-masked valignq: result merges into the %src register (%zmm2) under
; %k1, then is copied back to the return register %zmm0.
define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
; CHECK-LABEL: test_mask_valign_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
  ret <8 x i64> %res
}
959
960declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
961
; Zero-masked valignd (zeroinitializer passthru): lowers with {%k1} {z}
; zeroing semantics and shift count 5.
define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_maskz_valign_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}
971
972declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
973
; Masked scalar store of the low float element: lowers to vmovss with a {%k1}
; write-mask qualifier.
; NOTE: re-indented the two body lines from one space to the file's standard
; two-space indentation; no token changed.
define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
; CHECK-LABEL: test_mask_store_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovss %xmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask)
  ret void
}
983
984declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
985
; Unmasked equality compare of <16 x i32> (mask -1): vpcmpeqd writes %k0 and
; the i16 bitmask is read out with kmovw.
define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}
995
; Masked equality compare of <16 x i32>: the i16 mask becomes a {%k1}
; qualifier on vpcmpeqd (result ANDed with the mask).
define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}
1006
1007declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
1008
; Unmasked equality compare of <8 x i64> (mask -1): vpcmpeqq writes %k0 and
; the i8 bitmask is read out with kmovw.
define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}
1018
; Masked equality compare of <8 x i64>: i8 mask is zero-extended, moved into
; %k1, and applied as a {%k1} qualifier on vpcmpeqq.
define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}
1030
1031declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
1032
; Unmasked signed greater-than compare of <16 x i32> (mask -1) via vpcmpgtd.
define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}
1042
; Masked signed greater-than compare of <16 x i32>: the i16 mask becomes a
; {%k1} qualifier on vpcmpgtd.
define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}
1053
1054declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
1055
; Unmasked signed greater-than compare of <8 x i64> (mask -1) via vpcmpgtq.
define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}
1065
; Masked signed greater-than compare of <8 x i64>: i8 mask is zero-extended,
; moved into %k1, and applied as a {%k1} qualifier on vpcmpgtq.
define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}
1077
1078declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
1079
; Exercises all 8 signed-compare predicates (0..7) of the masked cmp.d.512
; intrinsic with an all-ones mask; each i16 result is collected into one lane
; of a <8 x i16> vector via vpinsrw.
define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %esi
; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    vmovd %r8d, %xmm0
; CHECK-NEXT:    vpinsrw $1, %r9d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $2, %r10d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $3, %esi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $4, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
1126
; Same as test_cmp_d_512 but with a variable mask: %k1 is loaded once and each
; of the 8 vpcmp*d instructions carries a {%k1} qualifier.
define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %esi
; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    vmovd %r8d, %xmm0
; CHECK-NEXT:    vpinsrw $1, %r9d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $2, %r10d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $3, %esi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $4, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
1174
1175declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
1176
; Exercises all 8 unsigned-compare predicates (0..7) of the masked ucmp.d.512
; intrinsic with an all-ones mask; lowers to the vpcmp*ud forms, results
; collected into a <8 x i16> via vpinsrw.
define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %esi
; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    vmovd %r8d, %xmm0
; CHECK-NEXT:    vpinsrw $1, %r9d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $2, %r10d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $3, %esi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $4, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
1223
; Same as test_ucmp_d_512 but with a variable mask: %k1 is loaded once and
; each of the 8 vpcmp*ud instructions carries a {%k1} qualifier.
define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %esi
; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    vmovd %r8d, %xmm0
; CHECK-NEXT:    vpinsrw $1, %r9d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $2, %r10d, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $3, %esi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $4, %edi, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
; CHECK-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}
1271
1272declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
1273
; Exercises all 8 signed-compare predicates (0..7) of the masked cmp.q.512
; intrinsic with an all-ones mask; each i8 result is zero-extended (movzbl)
; and inserted into every other byte of a <8 x i8> via vpinsrb.
define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r11d
; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    movzbl %r8b, %esi
; CHECK-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r9b, %esi
; CHECK-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r10b, %esi
; CHECK-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r11b, %esi
; CHECK-NEXT:    vpinsrb $6, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dil, %esi
; CHECK-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dl, %eax
; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
1328
; Same as test_cmp_q_512 but with a variable i8 mask: the mask is
; zero-extended into %k1 once and each vpcmp*q carries a {%k1} qualifier.
define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r11d
; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    movzbl %r8b, %esi
; CHECK-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r9b, %esi
; CHECK-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r10b, %esi
; CHECK-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r11b, %esi
; CHECK-NEXT:    vpinsrb $6, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dil, %esi
; CHECK-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dl, %eax
; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
1385
1386declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
1387
; Exercises all 8 unsigned-compare predicates (0..7) of the masked ucmp.q.512
; intrinsic with an all-ones mask; lowers to the vpcmp*uq forms, results
; zero-extended and packed into a <8 x i8> via vpinsrb.
define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %r11d
; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    movzbl %r8b, %esi
; CHECK-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r9b, %esi
; CHECK-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r10b, %esi
; CHECK-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r11b, %esi
; CHECK-NEXT:    vpinsrb $6, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dil, %esi
; CHECK-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dl, %eax
; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}
1442
; Masked variant of the ucmp.q.512 predicate sweep: the i8 %mask is
; zero-extended (movzbl) into k1 and applied to each vpcmp*uq via {%k1}.
define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r8d
; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r9d
; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r10d
; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %r11d
; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edi
; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %edx
; CHECK-NEXT:    movzbl %r8b, %esi
; CHECK-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r9b, %esi
; CHECK-NEXT:    vpinsrb $2, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r10b, %esi
; CHECK-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %r11b, %esi
; CHECK-NEXT:    vpinsrb $6, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dil, %esi
; CHECK-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %cl, %eax
; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; CHECK-NEXT:    movzbl %dl, %eax
; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
1501
; Merge-masked extract of 128-bit lane 2 from a 512-bit float vector; masked-off
; lanes keep the passthru %b (xmm0).
define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
; CHECK-LABEL: test_mask_vextractf32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
1513
; Merge-masked extract of a 256-bit integer half from a 512-bit vector.
; NOTE(review): imm $2 is passed through verbatim here even though a 512-bit
; source only has halves 0/1 — this pins existing immediate handling.
define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
; CHECK-LABEL: test_mask_vextracti64x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vextracti64x4 $2, %zmm1, %ymm0 {%k1}
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
1525
; Zero-masked 128-bit extract: zeroinitializer passthru selects the {z} form.
define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
; CHECK-LABEL: test_maskz_vextracti32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8)
1537
; Unmasked 256-bit extract: all-ones mask folds to a plain vextractf64x4.
define <4 x double> @test_vextractf64x4(<8 x double> %a) {
; CHECK-LABEL: test_vextractf64x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf64x4 $2, %zmm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
1548
; llvm.x86.avx512.mask.pslli.d: immediate shift-left (vpslld $7) of <16 x i32>
; in unmasked, merge-masked and zero-masked forms.

; Unmasked: all-ones mask folds to a plain vpslld.
define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masked: masked-off lanes keep the passthru %a1.
define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masked: zeroinitializer passthru selects the {z} form.
define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
1580
; llvm.x86.avx512.mask.pslli.q: immediate shift-left (vpsllq $7) of <8 x i64>.
; The masked forms zero-extend the i8 mask (movzbl) before kmovw.

; Unmasked.
define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masked.
define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masked.
define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
1614
; llvm.x86.avx512.mask.psrli.d: immediate logical shift-right (vpsrld $7) of
; <16 x i32> — unmasked, merge-masked, zero-masked.

; Unmasked.
define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masked.
define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masked.
define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
1646
; llvm.x86.avx512.mask.psrli.q: immediate logical shift-right (vpsrlq $7) of
; <8 x i64>; i8 mask is zero-extended before kmovw in the masked forms.

; Unmasked.
define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masked.
define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masked.
define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
1680
; llvm.x86.avx512.mask.psrai.d: immediate arithmetic shift-right (vpsrad $7)
; of <16 x i32> — unmasked, merge-masked, zero-masked.

; Unmasked.
define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masked.
define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masked.
define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
1712
; llvm.x86.avx512.mask.psrai.q: immediate arithmetic shift-right (vpsraq $7)
; of <8 x i64> — an AVX-512-only instruction (no SSE/AVX2 qword psra).

; Unmasked.
define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masked.
define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masked.
define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
1746
; llvm.x86.avx512.mask.psll.d: shift-left of <16 x i32> with the count taken
; from an xmm operand (<4 x i32> %a1) — unmasked, merge-masked, zero-masked.

; Unmasked.
define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masked; result lands in the passthru register %a2 (zmm2).
define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masked.
define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
1778
; llvm.x86.avx512.mask.psll.q: shift-left of <8 x i64> with the count from an
; xmm operand (<2 x i64> %a1).

; Unmasked.
define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masked.
define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masked.
define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
1812
; llvm.x86.avx512.mask.psrl.d: logical shift-right of <16 x i32> with the
; count from an xmm operand.

; Unmasked.
define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masked.
define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masked.
define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
1844
; llvm.x86.avx512.mask.psrl.q: logical shift-right of <8 x i64> with the count
; from an xmm operand.

; Unmasked.
define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masked.
define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masked.
define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
1878
; llvm.x86.avx512.mask.psra.d: arithmetic shift-right of <16 x i32> with the
; count from an xmm operand.

; Unmasked.
define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masked.
define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masked.
define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
1910
; llvm.x86.avx512.mask.psra.q: arithmetic shift-right of <8 x i64> with the
; count from an xmm operand (vpsraq, AVX-512-only).

; Unmasked.
define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masked.
define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masked.
define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
1944
; llvm.x86.avx512.mask.psllv.d: variable (per-element) shift-left of
; <16 x i32> — unmasked, merge-masked, zero-masked.

; Unmasked.
define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masked.
define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masked.
define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
1976
; llvm.x86.avx512.mask.psllv.q: variable (per-element) shift-left of <8 x i64>.

; Unmasked.
define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masked.
define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masked.
define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
2010
2011
; llvm.x86.avx512.mask.psrav.d: variable arithmetic shift-right of <16 x i32>.

; Unmasked.
define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masked.
define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masked.
define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
2043
; llvm.x86.avx512.mask.psrav.q: variable arithmetic shift-right of <8 x i64>
; (vpsravq, AVX-512-only).

; Unmasked.
define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masked.
define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masked.
define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
2077
; llvm.x86.avx512.mask.psrlv.d: variable logical shift-right of <16 x i32>.

; Unmasked.
define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; Merge-masked.
define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
  ret <16 x i32> %res
}

; Zero-masked.
define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
2109
; llvm.x86.avx512.mask.psrlv.q: variable logical shift-right of <8 x i64>.

; Unmasked.
define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

; Merge-masked.
define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
  ret <8 x i64> %res
}

; Zero-masked.
define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
2143
2144define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) {
2145; CHECK-LABEL: test_x86_avx512_psrlv_q_memop:
2146; CHECK:       ## BB#0:
2147; CHECK-NEXT:    vpsrlvq (%rdi), %zmm0, %zmm0
2148; CHECK-NEXT:    retq
2149  %b = load <8 x i64>, <8 x i64>* %ptr
2150  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
2151  ret <8 x i64> %res
2152}
2153
2154declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
2155declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
2156declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
2157
; Unmasked vsubps/vmulps with explicit static rounding. The trailing i32
; rounding operand maps to the embedded rounding annotation in the output:
;   0 -> {rn-sae}  (round to nearest)
;   1 -> {rd-sae}  (round down)
;   2 -> {ru-sae}  (round up)
;   3 -> {rz-sae}  (round toward zero)
; All use mask = -1 and a zeroinitializer passthru, so no {%k} appears.
define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}
2237
;; Zero-masked float tests: zeroinitializer passthru with a live mask -> {%k1} {z}.
; Zero-masked vmulps with static rounding: a live i16 mask plus a
; zeroinitializer passthru selects the {%k1} {z} encoding, combined with the
; same rounding-operand -> {r?-sae} mapping as the unmasked tests above.
define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> zeroinitializer, i16 %mask, i32 3)
  ret <16 x float> %res
}
2282
;; Merge-masked tests with an explicit passthru value -> {%k1} (no {z}).
; Merge-masked vmulps with static rounding: a non-zero %passthru selects the
; merging {%k1} form (no {z}); the result is produced in the passthru register
; zmm2 and then moved to zmm0.
define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 0)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 1)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
                    <16 x float> %passthru, i16 %mask, i32 3)
  ret <16 x float> %res
}
2331
;; Zero-masked double tests: i8 mask goes through movzbl before kmovw.
; Zero-masked vmulpd with static rounding. The i8 mask is zero-extended with
; movzbl before kmovw (contrast with the i16 ps masks, which move directly).
define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 0)
  ret <8 x double> %res
}

define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 1)
  ret <8 x double> %res
}

define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 2)
  ret <8 x double> %res
}

define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
                    <8 x double> zeroinitializer, i8 %mask, i32 3)
  ret <8 x double> %res
}
2380
; Masked bitwise logic intrinsics (pxor/por/pand, both .d.512 and .q.512).
; Each pair tests the unmasked lowering (mask = -1 -> plain vpxord etc.) and
; the merge-masked lowering (result built in the passthru register under %k1,
; then moved to zmm0). i8 masks for the .q forms go through movzbl first.
define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_xor_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_xor_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_or_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_or_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_and_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_and_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_xor_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_or_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_and_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
2515
2516
; Masked 32-bit integer add (@llvm.x86.avx512.mask.padd.d.512) lowering.
; Suffix convention, matched by the CHECK lines:
;   rr   = reg,reg           rrk  = reg,reg merge-masked    rrkz = zero-masked
;   rm*  = memory operand folded into vpaddd (%rdi)
;   rmb* = scalar load broadcast, folded as (%rdi){1to16}
define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_add_epi32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

; The insertelement + shufflevector splat below is the canonical IR broadcast
; pattern; it must fold into the {1to16} embedded-broadcast memory operand.
define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
2620
; Masked 32-bit integer sub (@llvm.x86.avx512.mask.psub.d.512) lowering.
; Same rr/rrk/rrkz/rm/rmk/rmkz/rmb/rmbk/rmbkz matrix as the padd.d tests
; above, checking vpsubd register, memory, and {1to16} broadcast forms.
define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_sub_epi32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
2724
; Masked 64-bit integer add (@llvm.x86.avx512.mask.padd.q.512) lowering.
; Same rr/rm/rmb matrix as the epi32 tests, with vpaddq, {1to8} broadcast,
; and the i8 mask zero-extended via movzbl before kmovw.
define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_mask_add_epi64_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi64_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

; The splat (insertelement + shufflevector) must fold into the {1to8}
; embedded-broadcast memory operand.
define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi64_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
2834
; Masked 512-bit vpsubq tests for @llvm.x86.avx512.mask.psub.q.512,
; covering register/memory/broadcast operands crossed with unmasked,
; merge-masking (k) and zero-masking (kz) forms.
; NOTE(review): CHECK lines are autogenerated (update_llc_test_checks.py).

; register operands, unmasked
define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_mask_sub_epi64_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; register operands, merge-masking into %passThru
define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

; register operands, zero-masking
define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

; memory operand, unmasked
define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi64_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; memory operand, merge-masking into %passThru
define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

; memory operand, zero-masking
define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <8 x i64>, <8 x i64>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

; broadcast operand, unmasked
define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi64_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; broadcast operand, merge-masking into %passThru
define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

; broadcast operand, zero-masking
define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
2944
; Masked vpmuldq (signed 32x32->64 multiply) tests for
; @llvm.x86.avx512.mask.pmul.dq.512: <16 x i32> inputs, <8 x i64> result.
; The _rmb* variants broadcast a single i64 ({1to8}) and bitcast the splat
; to <16 x i32> to feed the intrinsic.
; NOTE(review): CHECK lines are autogenerated (update_llc_test_checks.py).

; register operands, unmasked
define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epi32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; register operands, merge-masking into %passThru
define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

; register operands, zero-masking
define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

; memory operand, unmasked
define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; memory operand, merge-masking into %passThru
define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

; memory operand, zero-masking
define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

; i64 broadcast operand (bitcast to <16 x i32>), unmasked
define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; i64 broadcast operand, merge-masking into %passThru
define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

; i64 broadcast operand, zero-masking
define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
3057
; Masked vpmuludq (unsigned 32x32->64 multiply) tests for
; @llvm.x86.avx512.mask.pmulu.dq.512 — unsigned counterpart of the
; mul_epi32 group above, same operand/masking matrix.
; NOTE(review): CHECK lines are autogenerated (update_llc_test_checks.py).

; register operands, unmasked
define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epu32_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; register operands, merge-masking into %passThru
define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

; register operands, zero-masking
define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

; memory operand, unmasked
define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rm:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; memory operand, merge-masking into %passThru
define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

; memory operand, zero-masking
define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

; i64 broadcast operand (bitcast to <16 x i32>), unmasked
define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rmb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
  ret < 8 x i64> %res
}

; i64 broadcast operand, merge-masking into %passThru
define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
  ret < 8 x i64> %res
}

; i64 broadcast operand, zero-masking
define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %b = bitcast <8 x i64> %b64 to <16 x i32>
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
  ret < 8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
3170
; Masked vpmulld (32-bit low multiply) tests for
; @llvm.x86.avx512.mask.pmull.d.512. Unlike the widening multiplies above,
; these use an i16 mask (16 x i32 lanes) moved straight into %k1 with kmovw
; (no movzbl), and the broadcast variants splat an i32 ({1to16}).
; NOTE(review): CHECK lines are autogenerated (update_llc_test_checks.py).

; register operands, unmasked
define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mullo_epi32_rr_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

; register operands, merge-masking into %passThru
define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rrk_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

; register operands, zero-masking
define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rrkz_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

; memory operand, unmasked
define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi32_rm_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

; memory operand, merge-masking into %passThru
define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmk_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

; memory operand, zero-masking
define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmkz_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %b = load <16 x i32>, <16 x i32>* %ptr_b
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

; i32 broadcast operand, unmasked
define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi32_rmb_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret < 16 x i32> %res
}

; i32 broadcast operand, merge-masking into %passThru
define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmbk_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret < 16 x i32> %res
}

; i32 broadcast operand, zero-masking
define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmbkz_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret < 16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
3274
; Embedded-rounding tests for @llvm.x86.avx512.mask.add.ps.512.
; The trailing i32 selects the rounding mode encoded on the instruction:
; 0 -> {rn-sae}, 1 -> {rd-sae}, 2 -> {ru-sae}, 3 -> {rz-sae},
; 4 -> current rounding mode (no static-rounding annotation emitted).
; Three groups: maskz_* (zero-masking), mask_* (merge into %src),
; and unmasked (i16 -1; the %mask parameter is intentionally unused).
; NOTE(review): CHECK lines are autogenerated (update_llc_test_checks.py).

; zero-masking, imm 0 -> {rn-sae}
define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
  ret <16 x float> %res
}
; zero-masking, imm 1 -> {rd-sae}
define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
  ret <16 x float> %res
}
; zero-masking, imm 2 -> {ru-sae}
define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
  ret <16 x float> %res
}

; zero-masking, imm 3 -> {rz-sae}
define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
  ret <16 x float> %res
}


; zero-masking, imm 4 -> current rounding mode (plain vaddps)
define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}

; merge-masking into %src, imm 0 -> {rn-sae}
define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
  ret <16 x float> %res
}
; merge-masking into %src, imm 1 -> {rd-sae}
define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
  ret <16 x float> %res
}
; merge-masking into %src, imm 2 -> {ru-sae}
define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
  ret <16 x float> %res
}

; merge-masking into %src, imm 3 -> {rz-sae}
define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
  ret <16 x float> %res
}


; merge-masking into %src, imm 4 -> current rounding mode
define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}


; unmasked (i16 -1), imm 0 -> {rn-sae}
define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}
; unmasked, imm 1 -> {rd-sae}
define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}
; unmasked, imm 2 -> {ru-sae}
define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

; unmasked, imm 3 -> {rz-sae}
define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}

; unmasked, imm 4 -> current rounding mode
define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
3422
; Embedded-rounding tests for @llvm.x86.avx512.mask.sub.ps.512 (vsubps),
; mirroring the add.ps.512 group above: trailing i32 0..3 select
; {rn,rd,ru,rz}-sae, 4 selects the current rounding mode. mask_* merges
; into %src; the unmasked forms pass i16 -1 and leave %mask unused.
; NOTE(review): CHECK lines are autogenerated (update_llc_test_checks.py).

; merge-masking into %src, imm 0 -> {rn-sae}
define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
  ret <16 x float> %res
}
; merge-masking into %src, imm 1 -> {rd-sae}
define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
  ret <16 x float> %res
}
; merge-masking into %src, imm 2 -> {ru-sae}
define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
  ret <16 x float> %res
}

; merge-masking into %src, imm 3 -> {rz-sae}
define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
  ret <16 x float> %res
}


; merge-masking into %src, imm 4 -> current rounding mode
define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vsubps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}

; unmasked (i16 -1), imm 0 -> {rn-sae}
define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}
; unmasked, imm 1 -> {rd-sae}
define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}
; unmasked, imm 2 -> {ru-sae}
define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

; unmasked, imm 3 -> {rz-sae}
define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}

; unmasked, imm 4 -> current rounding mode
define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vsubps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
3519
; Zero-masked vdivps ({z} form: passthru = zeroinitializer) with each static
; rounding immediate (0..3) plus current rounding (4).
define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 3)
  ret <16 x float> %res
}


define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}
3568
; Merge-masked vdivps (result blends into %src under %mask) with each static
; rounding immediate (0..3) plus current rounding (4).
define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 3)
  ret <16 x float> %res
}


define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}
3622
3623
; Unmasked vdivps (mask = -1) with each static rounding immediate (0..3)
; plus current rounding (4), followed by the intrinsic declaration.
define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rn_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
  ret <16 x float> %res
}
define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_ru_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rz_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdivps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
3667
; vminps tests: min/max only support {sae} (imm 8, suppress-all-exceptions)
; or current rounding (imm 4) -- no directed-rounding variants. Covers
; zero-masked, merge-masked, and unmasked forms, plus the declaration.
define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_min_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_min_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vminps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
3728
; vmaxps tests, mirroring the vminps coverage above: {sae} (imm 8) and
; current rounding (imm 4), in zero-masked, merge-masked, and unmasked forms.
define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_max_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_max_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
  ret <16 x float> %res
}

define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
  ret <16 x float> %res
}
declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
3789
; Scalar vaddss with rounding. The i8 mask is reduced to its low bit
; (andl $1, %edi) before moving into a k-register, since only element 0 of
; the scalar op is maskable. Covers merge-masked rn/rd/ru/rz/current,
; zero-masked rn, and unmasked rn.
declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 0)
  ret <4 x float> %res
}

define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
  ret <4 x float> %res
}

define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 2)
  ret <4 x float> %res
}

define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 3)
  ret <4 x float> %res
}

define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 0)
  ret <4 x float> %res
}

define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_add_ss_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 0)
  ret <4 x float> %res
}
3871
; Scalar vaddsd with rounding -- double-precision counterpart of the vaddss
; tests above; identical mask handling (low bit of the i8 mask only).
declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 0)
  ret <2 x double> %res
}

define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
  ret <2 x double> %res
}

define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_ru:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 2)
  ret <2 x double> %res
}

define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 3)
  ret <2 x double> %res
}

define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_current:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_sd_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 0)
  ret <2 x double> %res
}

define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_add_sd_rn:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 0)
  ret <2 x double> %res
}
3953
; Scalar vmaxss: {sae} (imm 8) and current-rounding (imm 4) forms, each in
; merge-masked, zero-masked, and unmasked variants.
declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
  ret <4 x float> %res
}

define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
  ret <4 x float> %res
}

define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxss {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
  ret <4 x float> %res
}

define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
  ret <4 x float> %res
}

define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
  ret <4 x float> %res
}
; Scalar vmaxsd -- double-precision counterpart of the vmaxss tests above.
declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxsd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
  ret <2 x double> %res
}

define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
  ret <2 x double> %res
}

define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
  ret <2 x double> %res
}
4084
; Signed int -> scalar FP conversions (vcvtsi2sd/vcvtsi2ss, 32- and 64-bit
; sources) with the {rz-sae} static-rounding operand (imm 3).
define <2 x double> @test_x86_avx512_cvtsi2sd32(<2 x double> %a, i32 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2sd32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2sdl %edi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.cvtsi2sd32(<2 x double> %a, i32 %b, i32 3) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtsi2sd32(<2 x double>, i32, i32) nounwind readnone

define <2 x double> @test_x86_avx512_cvtsi2sd64(<2 x double> %a, i64 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2sd64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2sdq %rdi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double> %a, i64 %b, i32 3) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double>, i64, i32) nounwind readnone

define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2ss32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2ssl %edi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 3) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone

define <4 x float> @test_x86_avx512_cvtsi2ss64(<4 x float> %a, i64 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2ss64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtsi2ssq %rdi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float> %a, i64 %b, i32 3) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float>, i64, i32) nounwind readnone
4124
4125define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b)
4126; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
4127; CHECK:       ## BB#0:
4128; CHECK-NEXT:    vcvtusi2ssl %edi, {rd-sae}, %xmm0, %xmm0
4129; CHECK-NEXT:    retq
4130{
4131  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
4132  ret <4 x float> %res
4133}
4134
; Same conversion but with the source loaded from memory: with a static
; rounding mode the load is NOT folded — the checks expect an explicit
; movl followed by the register form of vcvtusi2ssl.
define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, i32* %ptr)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl (%rdi), %eax
; CHECK-NEXT:    vcvtusi2ssl %eax, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %b = load i32, i32* %ptr
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 1) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
4146
; Rounding operand 4 (current direction / no static rounding): the plain
; vcvtusi2ssl encoding without an {sae} modifier must be emitted.
define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
4156
; Memory-source variant without static rounding: here the load IS folded
; into the conversion ((%rdi) operand), unlike the {rd-sae} test above.
define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, i32* %ptr)
; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssl (%rdi), %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %b = load i32, i32* %ptr
  %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
4167declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone
4168
; Unsigned 64-bit -> float with rounding operand 1 (round-down):
; expects the q-suffixed vcvtusi2ssq with {rd-sae}.
define <4 x float> @_mm_cvt_roundu64_ss (<4 x float> %a, i64 %b)
; CHECK-LABEL: _mm_cvt_roundu64_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssq %rdi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 1) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
4178
; Same intrinsic with rounding operand 4 (current direction): no {sae}
; modifier in the emitted instruction.
define <4 x float> @_mm_cvtu64_ss(<4 x float> %a, i64 %b)
; CHECK-LABEL: _mm_cvtu64_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2ssq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float> %a, i64 %b, i32 4) ; <<<4 x float>> [#uses=1]
  ret <4 x float> %res
}
4188declare <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float>, i64, i32) nounwind readnone
4189
; Unsigned 32-bit -> double. Note this intrinsic takes no rounding
; operand (see its 2-argument declare below), so only the plain
; vcvtusi2sdl form is possible.
define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512_mm_cvtu32_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double> %a, i32 %b) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
4199declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind readnone
4200
; NOTE(review): the name suggests the no-rounding _mm_cvtu64_sd form, yet
; the rounding operand is 1 ({rd-sae}); this looks swapped with
; test_x86_avx512__mm_cvt_roundu64_sd below — confirm before renaming,
; since the CHECK-LABEL is tied to the function name.
define <2 x double> @test_x86_avx512_mm_cvtu64_sd(<2 x double> %a, i64 %b)
; CHECK-LABEL: test_x86_avx512_mm_cvtu64_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2sdq %rdi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 1) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
4210
; NOTE(review): the name suggests the rounding form, yet the rounding
; operand is 4 (current direction, no {sae}); looks swapped with
; test_x86_avx512_mm_cvtu64_sd above — confirm before renaming, since
; the CHECK-LABEL is tied to the function name.
define <2 x double> @test_x86_avx512__mm_cvt_roundu64_sd(<2 x double> %a, i64 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu64_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtusi2sdq %rdi, %xmm0, %xmm0
; CHECK-NEXT:    retq
{
  %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a, i64 %b, i32 4) ; <<<2 x double>> [#uses=1]
  ret <2 x double> %res
}
4220declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64, i32) nounwind readnone
4221
; An all-ones mask with a zero passthru must fold to the plain unmasked
; vpmaxsq (no {%k} or {z} suffix in the checks).
define <8 x i64> @test_vpmaxq(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_vpmaxq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %a0, <8 x i64> %a1,
                    <8 x i64>zeroinitializer, i8 -1)
  ret <8 x i64> %res
}
4231declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
4232
; All-ones mask + zero passthru folds to the plain unmasked vpminud.
define <16 x i32> @test_vpminud(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_vpminud:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %a0, <16 x i32> %a1,
                    <16 x i32>zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
4242declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
4243
; All-ones mask + zero passthru folds to the plain unmasked vpmaxsd.
define <16 x i32> @test_vpmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_vpmaxsd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %a0, <16 x i32> %a1,
                    <16 x i32>zeroinitializer, i16 -1)
  ret <16 x i32> %res
}
4253declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
4254
; Exercises both the merge-masked ({%k1} into %zmm2) and unmasked (-1
; mask) forms; the final add keeps both calls live past DCE.
define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
4268
; q-element variant: the i8 mask is zero-extended (movzbl) before being
; moved into %k1; masked and unmasked results are summed to keep both.
define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
4283
4284declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
4285
; Masked + unmasked vpmaxud; results summed to keep both calls live.
define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmaxud %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
4299
4300declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
4301
; Masked + unmasked vpmaxuq; i8 mask goes through movzbl before kmovw.
define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
4316
4317declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
4318
; Masked + unmasked vpminsd; results summed to keep both calls live.
define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpminsd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
4332
4333declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
4334
; Masked + unmasked vpminsq; i8 mask goes through movzbl before kmovw.
define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
4349
; Masked + unmasked vpminud (declare for this intrinsic is earlier in
; the file, next to test_vpminud).
define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
4363
4364declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
4365
; Masked + unmasked vpminuq; i8 mask goes through movzbl before kmovw.
define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
4380
4381declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
4382
; vpermi2d overwrites its index operand, so the masked form first copies
; %zmm1 into %zmm3; the masked call also folds its table operand from
; memory ((%rdi)). Unmasked call uses the in-register %x4 table.
define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %x2 = load <16 x i32>, <16 x i32>* %x2p
  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
4398
4399declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
4400
; FP double variant of vpermi2var: masked form works on a copy of the
; index register (%zmm3); results combined with vaddpd.
define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
4416
4417declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
4418
; FP float variant of vpermi2var: masked form works on a copy of the
; index register (%zmm3); results combined with vaddps.
define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
4433
4434declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
4435
; q-element vpermi2var: masked form works on a copy of the index
; register (%zmm3); results combined with vpaddq.
define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
4451
4452declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
4453
; Zero-masked ({%k1} {z}) vpermt2d with the data operand folded from
; memory. Note the second call passes %x1 as both index and data.
define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm2
; CHECK-NEXT:    vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpermt2d %zmm1, %zmm0, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %x2 = load <16 x i32>, <16 x i32>* %x2p
  %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x1, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
4469
4470declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
4471
; Zero-masked vpermt2pd where the data operand is a splatted scalar
; load; the splat must fold into the {1to8} embedded-broadcast form.
define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm2
; CHECK-NEXT:    vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
; CHECK-NEXT:    vpermt2pd %zmm1, %zmm0, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %x2s = load double, double* %x2ptr
  %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
  %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
  %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x1, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
4490
4491declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
4492
; Zero-masked ({%k1} {z}) and unmasked vpermt2ps; results combined with
; vaddps to keep both calls live.
define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
4507
4508
4509declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
4510
; Zero-masked ({%k1} {z}) and unmasked vpermt2q; i8 mask zero-extended
; via movzbl before kmovw.
define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
4526
4527declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
4528
; Merge-masked (non-zeroing {%k1}) and unmasked vpermt2d.
define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
4543
4544declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
; vscalefpd with two different static rounding modes: masked call uses
; operand 3 ({rz-sae}), unmasked call operand 0 ({rn-sae}).
define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 0)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
4559
4560declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
; vscalefps with two static rounding modes: masked call uses operand 2
; ({ru-sae}), unmasked call operand 0 ({rn-sae}).
define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 0)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
4574
4575declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
4576
; Masked + unmasked vunpckhpd; the autogenerated shuffle comments in
; the CHECK lines spell out the expected odd-element interleave.
define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
4591
4592declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
4593
; Masked + unmasked vunpckhps (high-element interleave per 128-bit lane).
define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
4607
4608declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
4609
; Masked + unmasked vunpcklpd (even-element interleave per 128-bit lane).
define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
4624
4625declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
4626
; Masked + unmasked vunpcklps (low-element interleave per 128-bit lane).
define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
4640
4641declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
4642
; vpunpcklqdq with three configurations: merge-masked, unmasked (-1),
; and masked with a zero passthru; all three results are summed so none
; of the calls can be dead-code-eliminated.
define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = k1[0],zmm0[0],k1[2],zmm0[2],k1[4],zmm0[4],k1[6],zmm0[6]
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}
4661
4662declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
4663
; Masked + unmasked vpunpckhqdq (odd-qword interleave per 128-bit lane).
define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
4678
4679declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
4680
; Masked + unmasked vpunpckhdq (odd-dword interleave per 128-bit lane).
define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm2[2],k1[2],zmm2[3],k1[3],zmm2[6],k1[6],zmm2[7],k1[7],zmm2[10],k1[10],zmm2[11],k1[11],zmm2[14],k1[14],zmm2[15],k1[15]
; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
4694
4695declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
4696
; Masked + unmasked vpunpckldq (even-dword interleave per 128-bit lane).
define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[1],k1[1],zmm2[4],k1[4],zmm2[5],k1[5],zmm2[8],k1[8],zmm2[9],k1[9],zmm2[12],k1[12],zmm2[13],k1[13]
; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
4710
4711declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
4712
; Truncating vpmovqb (q -> b) in all three forms: unmasked, merge-masked
; into %x1, and zero-masked ({%k1} {z}); sums keep all three live.
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovqb %zmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}
4730
4731declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
4732
; Memory-destination vpmovqb: unmasked store followed by a masked store
; to the same address.
define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovqb %zmm0, (%rdi)
; CHECK-NEXT:    vpmovqb %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
4745
4746declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)
4747
; Checks vpmovsqb (signed-saturating q->b truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovsqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovsqb %zmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}
4765
4766declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8)
4767
; Checks the memory form of vpmovsqb: an unmasked store (mask -1) followed by a
; masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
4779
4780declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
4781
; Checks vpmovusqb (unsigned-saturating q->b truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovusqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovusqb %zmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}
4799
4800declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8)
4801
; Checks the memory form of vpmovusqb: an unmasked store (mask -1) followed by
; a masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
4813
4814declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
4815
; Checks vpmovqw (q->w truncate) lowering for unmasked, merge-masked and
; zero-masked variants; the adds keep all three calls live.
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}
4834
4835declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
4836
; Checks the memory form of vpmovqw: an unmasked store (mask -1) followed by a
; masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovqw %zmm0, (%rdi)
; CHECK-NEXT:    vpmovqw %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
4849
4850declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
4851
; Checks vpmovsqw (signed-saturating q->w truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovsqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovsqw %zmm0, %xmm0
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}
4870
4871declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8)
4872
; Checks the memory form of vpmovsqw: an unmasked store (mask -1) followed by a
; masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
4884
4885declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
4886
; Checks vpmovusqw (unsigned-saturating q->w truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovusqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovusqw %zmm0, %xmm0
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}
4905
4906declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8)
4907
; Checks the memory form of vpmovusqw: an unmasked store (mask -1) followed by
; a masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
4919
4920declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
4921
; Checks vpmovqd (q->d truncate) lowering for unmasked, merge-masked and
; zero-masked variants; the adds keep all three calls live.
define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
    %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
    %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
    %res3 = add <8 x i32> %res0, %res1
    %res4 = add <8 x i32> %res3, %res2
    ret <8 x i32> %res4
}
4940
4941declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
4942
; Checks the memory form of vpmovqd: an unmasked store (mask -1) followed by a
; masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %sil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovqd %zmm0, (%rdi)
; CHECK-NEXT:    vpmovqd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
4955
4956declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
4957
; Checks vpmovsqd (signed-saturating q->d truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovsqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovsqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovsqd %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
    %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
    %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
    %res3 = add <8 x i32> %res0, %res1
    %res4 = add <8 x i32> %res3, %res2
    ret <8 x i32> %res4
}
4976
4977declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8)
4978
; Checks the memory form of vpmovsqd: an unmasked store (mask -1) followed by a
; masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
4990
4991declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
4992
; Checks vpmovusqd (unsigned-saturating q->d truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmovusqd %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovusqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovusqd %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
    %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
    %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
    %res3 = add <8 x i32> %res0, %res1
    %res4 = add <8 x i32> %res3, %res2
    ret <8 x i32> %res4
}
5011
5012declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8)
5013
; Checks the memory form of vpmovusqd: an unmasked store (mask -1) followed by
; a masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
    ret void
}
5025
5026declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
5027
; Checks vpmovdb (d->b truncate, i16 mask) lowering for unmasked, merge-masked
; and zero-masked variants; the adds keep all three calls live.
define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}
5045
5046declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16)
5047
; Checks the memory form of vpmovdb: an unmasked store (mask -1) followed by a
; masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovdb %zmm0, (%rdi)
; CHECK-NEXT:    vpmovdb %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
5059
5060declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
5061
; Checks vpmovsdb (signed-saturating d->b truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovsdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovsdb %zmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}
5079
5080declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16)
5081
; Checks the memory form of vpmovsdb: an unmasked store (mask -1) followed by a
; masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
5093
5094declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
5095
; Checks vpmovusdb (unsigned-saturating d->b truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpmovusdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpmovusdb %zmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}
5113
5114declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16)
5115
; Checks the memory form of vpmovusdb: an unmasked store (mask -1) followed by
; a masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
5127
5128declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
5129
; Checks vpmovdw (d->w truncate) lowering for unmasked, merge-masked and
; zero-masked variants; the adds keep all three calls live.
define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovdw %zmm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
    %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
    %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
    %res3 = add <16 x i16> %res0, %res1
    %res4 = add <16 x i16> %res3, %res2
    ret <16 x i16> %res4
}
5147
5148declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16)
5149
; Checks the memory form of vpmovdw: an unmasked store (mask -1) followed by a
; masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovdw %zmm0, (%rdi)
; CHECK-NEXT:    vpmovdw %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
5161
5162declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
5163
; Checks vpmovsdw (signed-saturating d->w truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovsdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovsdw %zmm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
    %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
    %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
    %res3 = add <16 x i16> %res0, %res1
    %res4 = add <16 x i16> %res3, %res2
    ret <16 x i16> %res4
}
5181
5182declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16)
5183
; Checks the memory form of vpmovsdw: an unmasked store (mask -1) followed by a
; masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
5195
5196declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
5197
; Checks vpmovusdw (unsigned-saturating d->w truncate) lowering for unmasked,
; merge-masked and zero-masked variants; the adds keep all three calls live.
define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpmovusdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpmovusdw %zmm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
    %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
    %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
    %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
    %res3 = add <16 x i16> %res0, %res1
    %res4 = add <16 x i16> %res3, %res2
    ret <16 x i16> %res4
}
5215
5216declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16)
5217
; Checks the memory form of vpmovusdw: an unmasked store (mask -1) followed by
; a masked store {%k1} to (%rdi).
define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi)
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    retq
    call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
    call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
    ret void
}
5229
5230declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
5231
; Checks vcvtdq2pd lowering: merge-masked call plus unmasked call (no rounding
; operand on this intrinsic); results are summed so both calls stay live.
define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
5246
5247declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32)
5248
; Checks vcvtdq2ps lowering: masked call with rounding operand 4 (no static
; rounding emitted) and unmasked call with operand 0, which emits {rn-sae}.
define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtdq2ps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
5262
5263declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
5264
; Checks vcvtpd2dq lowering: masked call with rounding operand 4 (default) and
; unmasked call with operand 0, which emits {rn-sae}.
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtpd2dq {rn-sae}, %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
5279
5280declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
5281
; Checks vcvtpd2ps lowering: masked call with rounding operand 4 (default) and
; unmasked call with operand 2, which emits {ru-sae}.
define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2ps %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtpd2ps {ru-sae}, %zmm0, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}
5296
5297declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
5298
; Checks vcvtpd2udq lowering with static rounding on both calls: masked with
; operand 2 ({ru-sae}) and unmasked with operand 0 ({rn-sae}).
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtpd2udq {rn-sae}, %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 2)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 0)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
5313
5314declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)
5315
; Checks vcvtps2dq lowering with static rounding on both calls: masked with
; operand 2 ({ru-sae}) and unmasked with operand 0 ({rn-sae}).
define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtps2dq {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
5329
5330declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)
5331
; Checks vcvtps2pd lowering: masked call with rounding operand 4 (default) and
; unmasked call with operand 8, which emits {sae}.
define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtps2pd {sae}, %ymm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
5346
5347declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
5348
; Checks vcvtps2udq lowering with static rounding on both calls: masked with
; operand 2 ({ru-sae}) and unmasked with operand 0 ({rn-sae}).
define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtps2udq {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 2)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 0)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
5362
5363declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
5364
; Checks vcvttpd2dq (truncating convert) lowering: masked call with operand 4
; (default) and unmasked call with operand 8, which emits {sae}.
define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvttpd2dq {sae}, %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
5379
5380declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
5381
; Checks vcvtudq2pd lowering: merge-masked call plus unmasked call (no rounding
; operand on this intrinsic); results are summed so both calls stay live.
define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
5396
5397
; vcvtudq2ps: masked call with default rounding (rc = 4), unmasked call with
; embedded {rn-sae} rounding (rc = 0).
declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32)

define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtudq2ps %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvtudq2ps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 0)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
5413
; Truncating pd->udq: masked call with default rounding (rc = 4), unmasked
; call with {sae} (rc = 8).
declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttpd2udq %zmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvttpd2udq {sae}, %zmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
5430
; Truncating ps->dq: masked call with default rounding (rc = 4), unmasked
; call with {sae} (rc = 8).
declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvttps2dq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvttps2dq {sae}, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
5446
; Truncating ps->udq: masked call with default rounding (rc = 4), unmasked
; call with {sae} (rc = 8).
declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)

define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvttps2udq %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vcvttps2udq {sae}, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
5462
5463
; Scalar scalef (ss): only bit 0 of the i8 mask matters, hence the
; "andl $1, %edi" before kmovw. Masked default-rounding call plus an
; unmasked {rn-sae} (rc = 8) call, summed.
declare <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32)
define <4 x float>@test_int_x86_avx512_mask_scalef_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vscalefss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vscalefss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 8)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}
5479
; Scalar scalef (sd): same pattern as the ss variant — low mask bit only,
; masked default-rounding call plus unmasked {rn-sae} (rc = 8) call.
declare <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32)
define <2 x double>@test_int_x86_avx512_mask_scalef_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vscalefsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vscalefsd {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 8)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}
5495
; Scalar getexp (ss), four variants: merge-masked with default rounding,
; merge-masked {sae}, zero-masked {sae}, and unmasked {sae}; all four results
; are folded together so none is dead.
declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vgetexpss {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
  %res3 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)

  %res.1 = fadd <4 x float> %res0, %res1
  %res.2 = fadd <4 x float> %res2, %res3
  %res   = fadd <4 x float> %res.1, %res.2
  ret <4 x float> %res
}
5522
; Scalar getexp (sd), four variants: merge-masked default rounding,
; merge-masked {sae}, zero-masked {sae}, and unmasked default rounding
; (note the last call uses rc = 4, unlike the ss test which uses {sae}).
declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    vaddpd %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vaddpd %xmm4, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
  %res3 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)

  %res.1 = fadd <2 x double> %res0, %res1
  %res.2 = fadd <2 x double> %res2, %res3
  %res   = fadd <2 x double> %res.1, %res.2
  ret <2 x double> %res
}
5549
; Masked scalar compare (sd), predicate 5 (NLT) with {sae}: the single mask
; bit is materialized in k0 and sign-extended to i8 via the shlb/sarb pair.
declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)

define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    shlb $7, %al
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    retq

  %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
  ret i8 %res4
}
5566
; Exercises cmp.sd across predicates 2..5 with and without {sae} and with
; all-ones vs. variable mask; the four i8 results are OR-combined.
define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT:    vcmplesd %xmm1, %xmm0, %k1
; CHECK-NEXT:    korw %k0, %k1, %k0
; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k1
; CHECK-NEXT:    vcmpneqsd %xmm1, %xmm0, %k2
; CHECK-NEXT:    korw %k1, %k2, %k1
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k2
; CHECK-NEXT:    kandw %k2, %k1, %k1
; CHECK-NEXT:    korw %k1, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    shlb $7, %al
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    retq

  %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8)
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, i8 %x3, i32 4)
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)

  %res11 = or i8 %res1, %res2
  %res12 = or i8 %res3, %res4
  %res13 = or i8 %res11, %res12
  ret i8 %res13
}
5595
; Masked scalar compare (ss), predicate 3 (unordered), default rounding;
; result sign-extended to i8 via shlb/sarb.
declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)

define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcmpunordss %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    shlb $7, %al
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    retq

  %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
  ret i8 %res2
}
5612
5613
; Exercises cmp.ss across predicates 2..5 with and without {sae} and with
; all-ones vs. variable mask; unlike the sd_all test, results are AND-combined.
define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcmpless %xmm1, %xmm0, %k1
; CHECK-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT:    vcmpneqss %xmm1, %xmm0, %k1
; CHECK-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k1 {%k1}
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k2
; CHECK-NEXT:    kandw %k2, %k1, %k1
; CHECK-NEXT:    kandw %k1, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    shlb $7, %al
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    retq
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)

  %res11 = and i8 %res1, %res2
  %res12 = and i8 %res3, %res4
  %res13 = and i8 %res11, %res12
  ret i8 %res13
}
5639
; 128-bit-lane shuffle (f32x4) with control imm 22: masked and unmasked
; variants of the same shuffle, summed.
declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
5655
; 128-bit-lane shuffle (f64x2), imm 22: merge-masked, unmasked, and
; zero-masked variants of the same shuffle, all folded together.
declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)

  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}
5677
; Integer 128-bit-lane shuffle (i32x4), imm 22: masked and unmasked
; variants, summed with vpaddd.
declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)

define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
5693
; Integer 128-bit-lane shuffle (i64x2), imm 22: masked and unmasked
; variants, summed with vpaddq.
declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)

define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
5710
; vgetmantpd with interval/sign-control imm 11: masked default-rounding
; call plus unmasked {sae} (rc = 8) call, summed.
declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)

define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vgetmantpd $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vgetmantpd $11,{sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}
5727
; vgetmantps with imm 11: masked default-rounding call plus unmasked
; {sae} (rc = 8) call, summed.
declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)

define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vgetmantps $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vgetmantps $11,{sae}, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
5743
; Scalar getmant (sd), imm 11, four variants: merge-masked, zero-masked,
; merge-masked with {sae}, and unmasked; folded together with fadds.
declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)

define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm2, %zmm3
; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
; CHECK-NEXT:    vgetmantsd $11, %xmm1, %xmm0, %xmm5
; CHECK-NEXT:    vgetmantsd $11,{sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm4, %xmm3, %xmm0
; CHECK-NEXT:    vaddpd %xmm5, %xmm2, %xmm1
; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res  = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 8)
  %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 -1, i32 4)
  %res11 = fadd <2 x double> %res, %res1
  %res12 = fadd <2 x double> %res2, %res3
  %res13 = fadd <2 x double> %res11, %res12
  ret <2 x double> %res13
}
5769
; Scalar getmant (ss), imm 11, four variants: merge-masked, zero-masked,
; unmasked with {sae}, and unmasked default rounding; folded with fadds.
declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)

define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT:    vgetmantss $11, %xmm1, %xmm0, %xmm4
; CHECK-NEXT:    vgetmantss $11,{sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddps %xmm4, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res  = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 8)
  %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 -1, i32 4)
  %res11 = fadd <4 x float> %res, %res1
  %res12 = fadd <4 x float> %res2, %res3
  %res13 = fadd <4 x float> %res11, %res12
  ret <4 x float> %res13
}
5794
; vshufpd with imm 22: merge-masked, unmasked, and zero-masked variants.
; NOTE(review): the "k1[...]" operands in the autogenerated shuffle-decode
; comments are an artifact of the check generator; the instruction operands
; themselves are zmm registers.
declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 = zmm2[0],k1[1],zmm2[3],k1[2],zmm2[5],k1[4],zmm2[6],k1[6]
; CHECK-NEXT:    vshufpd {{.*#+}} zmm3 = k1[0],zmm0[1],k1[3],zmm0[2],k1[5],zmm0[4],k1[6],zmm0[6]
; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)

  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}
5816
; vshufps with imm 22: masked and unmasked variants of the same shuffle.
declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vshufps {{.*#+}} zmm2 = zmm2[2,1],k1[1,0],zmm2[6,5],k1[5,4],zmm2[10,9],k1[9,8],zmm2[14,13],k1[13,12]
; CHECK-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}
5832
; Immediate vpermilpd (imm 22): merge-masked, zero-masked, and unmasked
; variants of the same in-lane permute, folded together.
declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 = zmm1[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm2 = k1[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}
5853
; Immediate vpermilps (imm 22): merge-masked, zero-masked, and unmasked
; variants of the same in-lane permute, folded together.
declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 = zmm1[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vpermilps {{.*#+}} zmm2 = k1[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res3, %res2
  ret <16 x float> %res4
}
5873
; Variable vpermilpd (control in %x1): merge-masked, zero-masked, and
; unmasked variants, folded together.
declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}
5894
; Variable vpermilps (control in %x1): merge-masked, zero-masked, and
; unmasked variants, folded together.
declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}
5914
; vinsertf32x4 at lane index 1: merge-masked, unmasked, and zero-masked
; variants, folded together with vaddps.
declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i8)

define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i8 %x4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i8 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i8 %x4)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}
5934
; vinserti32x4 at lane index 1: merge-masked, unmasked, and zero-masked
; variants, folded together with vpaddd.
declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i8)

define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i8 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i8 -1)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i8 %x4)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res2, %res3
  ret <16 x i32> %res4
}
5954
; vinsertf64x4 at half index 1: merge-masked, unmasked, and zero-masked
; variants, folded together with vaddpd.
declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8)

define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}
5975
5976declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8)
5977
; Exercises llvm.x86.avx512.mask.inserti64x4.512 with the %x4 write-mask,
; an all-ones mask, and a zeroinitializer passthrough (zero-masking); the
; adds keep all three calls live.
define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}
5996
5997declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float>, <4 x float>, <2 x double>, i8, i32)
5998
; Exercises llvm.x86.avx512.mask.cvtss2sd.round: once masked with rounding
; control 4 (current rounding) and once unmasked with 8 (which lowers to the
; {sae} form, per the checked asm); the fadd keeps both calls live.
define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<4 x float> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<4 x float> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}
6013
6014declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double>, <2 x double>, <4 x float>, i8, i32)
6015
; Exercises llvm.x86.avx512.mask.cvtsd2ss.round: masked with rounding control
; 3 (lowers to {rz-sae} in the checked asm) and unmasked with 8 (lowers to
; {rn-sae}); the fadd keeps both calls live.
define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<2 x double> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<2 x double> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}
6030
6031declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
6032
; Exercises llvm.x86.avx512.mask.pternlog.d.512 (immediate 33) with the %x4
; write-mask and with an all-ones mask; the add keeps both calls live.
define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
6047
6048declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
6049
; Same as the mask variant above but for the zero-masking intrinsic
; llvm.x86.avx512.maskz.pternlog.d.512 (note the {z} in the checked asm).
define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}
6064
6065declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
6066
; Exercises llvm.x86.avx512.mask.pternlog.q.512 (immediate 33) with the %x4
; write-mask and with an all-ones mask; the add keeps both calls live.
define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1}
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
6082
6083declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
6084
; Same as the mask variant above but for the zero-masking intrinsic
; llvm.x86.avx512.maskz.pternlog.q.512 (note the {z} in the checked asm).
define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}
6100
6101declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
6102
; Exercises llvm.x86.avx512.mask.movsldup.512 (duplicate even float lanes)
; with the %x2 write-mask, an all-ones mask, and a zeroinitializer
; passthrough; the fadds keep all three calls live.
define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}
6120
6121declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
6122
; Exercises llvm.x86.avx512.mask.movshdup.512 (duplicate odd float lanes)
; with the %x2 write-mask, an all-ones mask, and a zeroinitializer
; passthrough; the fadds keep all three calls live.
define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}
6140
6141declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
6142
; Exercises llvm.x86.avx512.mask.movddup.512 (duplicate even double lanes)
; with the %x2 write-mask, an all-ones mask, and a zeroinitializer
; passthrough; the fadds keep all three calls live.
define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}
6161
; llvm.x86.avx512.vcomi.sd with predicate 0 (EQ, per the test name) and
; rounding 8, which selects the {sae} form of vcomisd.
define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcomisd {sae}, %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
  ret i32 %res
}
6172
; llvm.x86.avx512.vcomi.sd with predicate 8 (unordered EQ, per the test
; name) and rounding 8 ({sae}); expects the unordered vucomisd form.
define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomisd {sae}, %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
  ret i32 %res
}
6183
; llvm.x86.avx512.vcomi.sd with predicate 0 (EQ) and rounding 4 (current
; rounding); expects the plain vcomisd form without {sae}.
define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcomisd %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
  ret i32 %res
}
6194
; llvm.x86.avx512.vcomi.sd with predicate 8 (unordered EQ) and rounding 4
; (current rounding); expects the plain vucomisd form.
define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomisd %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
  ret i32 %res
}
6205
; llvm.x86.avx512.vcomi.sd with predicate 1 (LT, per the test name) and
; rounding 8 ({sae}); the result is materialized via sbbl/andl.
define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcomisd {sae}, %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
  ret i32 %res
}
6216
; llvm.x86.avx512.vcomi.sd with predicate 9 (unordered LT, per the test
; name) and rounding 8 ({sae}); expects the vucomisd form.
define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomisd {sae}, %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
  ret i32 %res
}
6227
; llvm.x86.avx512.vcomi.sd with predicate 1 (LT) and rounding 4 (current
; rounding); expects the plain vcomisd form without {sae}.
define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcomisd %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
  ret i32 %res
}
6238
; llvm.x86.avx512.vcomi.sd with predicate 9 (unordered LT) and rounding 4
; (current rounding); expects the plain vucomisd form.
define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomisd %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
  ret i32 %res
}
6249
6250declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
6251
; Single-precision counterpart: llvm.x86.avx512.vcomi.ss with predicate 9
; (unordered LT) and rounding 4 (current rounding); expects vucomiss.
define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vucomiss %xmm1, %xmm0
; CHECK-NEXT:    sbbl %eax, %eax
; CHECK-NEXT:    andl $1, %eax
; CHECK-NEXT:    retq
  %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
  ret i32 %res
}
6262
6263declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
6264declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
6265
; llvm.x86.avx512.mask.move.ss, register-register with a merge write-mask
; (rrk): expects a masked vmovss merging into the %x2 passthrough.
define <4 x float>@test_int_x86_avx512_mask_move_ss_rrk(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  ret <4 x float> %res
}
6277
; llvm.x86.avx512.mask.move.ss with a zeroinitializer passthrough (rrkz):
; expects the zero-masking {z} form of vmovss.
define <4 x float>@test_int_x86_avx512_mask_move_ss_rrkz(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x2)
  ret <4 x float> %res
}
6288
; llvm.x86.avx512.mask.move.ss with an all-ones mask (rr): the mask is a
; no-op, so a plain unmasked vmovss is expected.
define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}
6297
6298declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
; llvm.x86.avx512.mask.move.sd with an all-ones mask (rr): the mask is a
; no-op, so a plain unmasked vmovsd is expected.
define <2 x double>@test_int_x86_avx512_mask_move_sd_rr(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rr:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 -1)
  ret <2 x double> %res
}
6307
; llvm.x86.avx512.mask.move.sd with a zeroinitializer passthrough (rrkz):
; expects the zero-masking {z} form of vmovsd.
define <2 x double>@test_int_x86_avx512_mask_move_sd_rrkz(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrkz:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 %x2)
  ret <2 x double> %res
}
6318
; llvm.x86.avx512.mask.move.sd, register-register with a merge write-mask
; (rrk): expects a masked vmovsd merging into the %x2 passthrough.
define <2 x double>@test_int_x86_avx512_mask_move_sd_rrk(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrk:
; CHECK:       ## BB#0:
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  ret <2 x double> %res
}
6330
6331