; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding | FileCheck %s
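; Tests for X86 AVX-512VL masked intrinsics: integer compares, compress/
; expand, blend, and pmuldq/pmuludq, at 128-bit and 256-bit vector widths.
; Each function calls one llvm.x86.avx512.mask.* intrinsic and the CHECK
; lines match the instruction (and, where shown, the encoding) llc emits.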

; 256-bit

define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)

define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)

define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d_256
; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d_256
; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)

define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q_256
; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q_256
; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)

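; The cmp/ucmp intrinsics take the comparison predicate as an immediate,
; printed with the suffixes matched below: 0=eq, 1=lt, 2=le, 3=unord,
; 4=neq, 5=nlt, 6=nle, 7=ord (cmp compares signed, ucmp unsigned). As a
; rough plain-IR sketch (illustrative only, not asserted by these tests),
; the unmasked pcmpeq.d.256 above computes:
;   %c = icmp eq <8 x i32> %a, %b
;   %m = bitcast <8 x i1> %c to i8
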
define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_d_256
; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone

define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_256
; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_256
; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone

define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_256
; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone

define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_256
; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_256
; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone

; 128-bit

define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)

define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)

define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d_128
; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d_128
; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)

define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q_128
; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q_128
; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)

define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_d_128
; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone

define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_128
; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_128
; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone

define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_128
; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone

define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_128
; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_128
; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone

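; Compress
;
; The compress intrinsics pack the elements selected by the mask into
; contiguous low positions (contiguous memory for the store forms);
; unselected destination elements keep the pass-through operand, or are
; zeroed when it is zeroinitializer (the {z} forms below). This is a
; summary of the intended semantics; the tests themselves only match
; the printed instructions.
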
; CHECK-LABEL: compr1
; CHECK: vcompresspd %zmm0
define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)

; CHECK-LABEL: compr2
; CHECK: vcompresspd %ymm0
define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)

; CHECK-LABEL: compr3
; CHECK: vcompressps %xmm0
define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)

; CHECK-LABEL: compr4
; CHECK: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)

; CHECK-LABEL: compr5
; CHECK: vcompresspd %ymm0, %ymm1 {%k1}  ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
  %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)

; CHECK-LABEL: compr6
; CHECK: vcompressps %xmm0
define <4 x float> @compr6(<4 x float> %data, i8 %mask) {
  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)

; CHECK-LABEL: compr7
; CHECK-NOT: vcompress
; CHECK: vmovupd
define void @compr7(i8* %addr, <8 x double> %data) {
  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
  ret void
}

; CHECK-LABEL: compr8
; CHECK-NOT: vcompressps %xmm0
define <4 x float> @compr8(<4 x float> %data) {
  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
  ret <4 x float> %res
}

; CHECK-LABEL: compr9
; CHECK: vpcompressq %zmm0, (%rdi) {%k1}  ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) {
  call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
  ret void
}

declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)

; CHECK-LABEL: compr10
; CHECK: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)

; Expand
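;
; The inverse of compress: expand reads contiguous source elements (from
; memory for the load forms) and scatters them into the destination lanes
; selected by the mask; unselected lanes take the pass-through value, or
; zero in the {z} forms.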

; CHECK-LABEL: expand1
; CHECK: vexpandpd (%rdi), %zmm0 {%k1}  ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)

; CHECK-LABEL: expand2
; CHECK: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) {
  %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)

; CHECK-LABEL: expand3
; CHECK: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) {
  %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)

; CHECK-LABEL: expand4
; CHECK: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
  ret <8 x double> %res
}

declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)

; CHECK-LABEL: expand5
; CHECK: vexpandpd %ymm0, %ymm1 {%k1}  ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
  %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)

; CHECK-LABEL: expand6
; CHECK: vexpandps %xmm0
define <4 x float> @expand6(<4 x float> %data, i8 %mask) {
  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)

; CHECK-LABEL: expand7
; CHECK-NOT: vexpand
; CHECK: vmovupd
define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
  ret <8 x double> %res
}

; CHECK-LABEL: expand8
; CHECK-NOT: vexpandps %xmm0
define <4 x float> @expand8(<4 x float> %data) {
  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
  ret <4 x float> %res
}

; CHECK-LABEL: expand9
; CHECK: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) {
  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)

; CHECK-LABEL: expand10
; CHECK: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)

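; Blend
;
; The blend intrinsics select per element between their two vector
; operands under the i8 mask and lower to vblendm*/vpblendm*; roughly,
; lane i of the result is the second operand's lane when mask bit i is
; set and the first operand's lane otherwise (intended semantics, not
; asserted here beyond the mnemonic).
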
define <8 x float> @test_x86_mask_blend_ps_256(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
  ; CHECK: vblendmps %ymm1, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float> %a1, <8 x float> %a2, i8 %a0) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}

declare <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readonly

define <4 x double> @test_x86_mask_blend_pd_256(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
  ; CHECK: vblendmpd %ymm1, %ymm0
  %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a1, <4 x double> %a2, i8 %a0) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}

define <4 x double> @test_x86_mask_blend_pd_256_memop(<4 x double> %a, <4 x double>* %ptr, i8 %mask) {
  ; CHECK-LABEL: test_x86_mask_blend_pd_256_memop
  ; CHECK: vblendmpd (%
  %b = load <4 x double>, <4 x double>* %ptr
  %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a, <4 x double> %b, i8 %mask) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readonly

; CHECK-LABEL: test_x86_mask_blend_d_256
; CHECK: vpblendmd
define <8 x i32> @test_x86_mask_blend_d_256(i8 %a0, <8 x i32> %a1, <8 x i32> %a2) {
  %res = call <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32> %a1, <8 x i32> %a2, i8 %a0) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly

define <4 x i64> @test_x86_mask_blend_q_256(i8 %a0, <4 x i64> %a1, <4 x i64> %a2) {
  ; CHECK: vpblendmq
  %res = call <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64> %a1, <4 x i64> %a2, i8 %a0) ; <<4 x i64>> [#uses=1]
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x float> @test_x86_mask_blend_ps_128(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
  ; CHECK: vblendmps %xmm1, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float> %a1, <4 x float> %a2, i8 %a0) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly

define <2 x double> @test_x86_mask_blend_pd_128(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
  ; CHECK: vblendmpd %xmm1, %xmm0
  %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a1, <2 x double> %a2, i8 %a0) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}

define <2 x double> @test_x86_mask_blend_pd_128_memop(<2 x double> %a, <2 x double>* %ptr, i8 %mask) {
  ; CHECK-LABEL: test_x86_mask_blend_pd_128_memop
  ; CHECK: vblendmpd (%
  %b = load <2 x double>, <2 x double>* %ptr
  %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a, <2 x double> %b, i8 %mask) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double>, <2 x double>, i8) nounwind readonly

define <4 x i32> @test_x86_mask_blend_d_128(i8 %a0, <4 x i32> %a1, <4 x i32> %a2) {
  ; CHECK: vpblendmd
  %res = call <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32> %a1, <4 x i32> %a2, i8 %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_x86_mask_blend_q_128(i8 %a0, <2 x i64> %a1, <2 x i64> %a2) {
  ; CHECK: vpblendmq
  %res = call <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64> %a1, <2 x i64> %a2, i8 %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64>, <2 x i64>, i8) nounwind readonly


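; Multiplies. pmuldq multiplies the even (low) 32-bit element of each
; 64-bit lane of the sources, sign-extended, giving 64-bit products;
; pmuludq further below is the zero-extending counterpart. Test-name
; suffixes follow the pattern visible in the bodies: rr = reg/reg,
; rm = reg/mem, rmb = reg/broadcast ({1toN}), with k = merge masking
; into a pass-through operand and kz = zero masking.
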
define < 2 x i64> @test_mask_mul_epi32_rr_128(< 4 x i32> %a, < 4 x i32> %b) {
  ;CHECK-LABEL: test_mask_mul_epi32_rr_128
  ;CHECK: vpmuldq %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0xc1]
  %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
  ret < 2 x i64> %res
}

define < 2 x i64> @test_mask_mul_epi32_rrk_128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rrk_128
  ;CHECK: vpmuldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0xd1]
  %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
  ret < 2 x i64> %res
}

define < 2 x i64> @test_mask_mul_epi32_rrkz_128(< 4 x i32> %a, < 4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rrkz_128
  ;CHECK: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1]
  %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
  ret < 2 x i64> %res
}

define < 2 x i64> @test_mask_mul_epi32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_mul_epi32_rm_128
  ;CHECK: vpmuldq (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf2,0xfd,0x08,0x28,0x07]
  %b = load < 4 x i32>, < 4 x i32>* %ptr_b
  %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
  ret < 2 x i64> %res
}

define < 2 x i64> @test_mask_mul_epi32_rmk_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmk_128
  ;CHECK: vpmuldq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0x0f]
  %b = load < 4 x i32>, < 4 x i32>* %ptr_b
  %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
  ret < 2 x i64> %res
}

define < 2 x i64> @test_mask_mul_epi32_rmkz_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmkz_128
  ;CHECK: vpmuldq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0x07]
  %b = load < 4 x i32>, < 4 x i32>* %ptr_b
  %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
  ret < 2 x i64> %res
}

define < 2 x i64> @test_mask_mul_epi32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmb_128
  ;CHECK: vpmuldq (%rdi){1to2}, %xmm0, %xmm0  ## encoding: [0x62,0xf2,0xfd,0x18,0x28,0x07]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
  %b = bitcast < 2 x i64> %b64 to < 4 x i32>
  %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
  ret < 2 x i64> %res
}

define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmbk_128
  ;CHECK: vpmuldq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x28,0x0f]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, <2 x i32> zeroinitializer
  %b = bitcast < 2 x i64> %b64 to < 4 x i32>
  %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask)
  ret < 2 x i64> %res
}

define < 2 x i64> @test_mask_mul_epi32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmbkz_128
  ;CHECK: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x28,0x07]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement < 2 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector < 2 x i64> %vecinit.i, < 2 x i64> undef, < 2 x i32> zeroinitializer
  %b = bitcast < 2 x i64> %b64 to < 4 x i32>
  %res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 %mask)
  ret < 2 x i64> %res
}

declare < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32>, < 4 x i32>, < 2 x i64>, i8)

define < 4 x i64> @test_mask_mul_epi32_rr_256(< 8 x i32> %a, < 8 x i32> %b) {
  ;CHECK-LABEL: test_mask_mul_epi32_rr_256
  ;CHECK: vpmuldq %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0xc1]
  %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
  ret < 4 x i64> %res
}

define < 4 x i64> @test_mask_mul_epi32_rrk_256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rrk_256
  ;CHECK: vpmuldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0xd1]
  %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
  ret < 4 x i64> %res
}

define < 4 x i64> @test_mask_mul_epi32_rrkz_256(< 8 x i32> %a, < 8 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rrkz_256
  ;CHECK: vpmuldq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0xc1]
  %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
  ret < 4 x i64> %res
}

define < 4 x i64> @test_mask_mul_epi32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_mul_epi32_rm_256
  ;CHECK: vpmuldq (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf2,0xfd,0x28,0x28,0x07]
  %b = load < 8 x i32>, < 8 x i32>* %ptr_b
  %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
  ret < 4 x i64> %res
}

define < 4 x i64> @test_mask_mul_epi32_rmk_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmk_256
  ;CHECK: vpmuldq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0x0f]
  %b = load < 8 x i32>, < 8 x i32>* %ptr_b
  %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
  ret < 4 x i64> %res
}

define < 4 x i64> @test_mask_mul_epi32_rmkz_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmkz_256
  ;CHECK: vpmuldq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0x07]
  %b = load < 8 x i32>, < 8 x i32>* %ptr_b
  %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
  ret < 4 x i64> %res
}

define < 4 x i64> @test_mask_mul_epi32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmb_256
  ;CHECK: vpmuldq (%rdi){1to4}, %ymm0, %ymm0  ## encoding: [0x62,0xf2,0xfd,0x38,0x28,0x07]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
  %b = bitcast < 4 x i64> %b64 to < 8 x i32>
  %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
  ret < 4 x i64> %res
}

define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmbk_256
  ;CHECK: vpmuldq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x28,0x0f]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
  %b = bitcast < 4 x i64> %b64 to < 8 x i32>
  %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask)
  ret < 4 x i64> %res
}

define < 4 x i64> @test_mask_mul_epi32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epi32_rmbkz_256
  ;CHECK: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x28,0x07]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement < 4 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector < 4 x i64> %vecinit.i, < 4 x i64> undef, < 4 x i32> zeroinitializer
  %b = bitcast < 4 x i64> %b64 to < 8 x i32>
  %res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 %mask)
  ret < 4 x i64> %res
}

declare < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32>, < 8 x i32>, < 4 x i64>, i8)

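; Unsigned variant. Note in the encodings below that the unmasked forms
; print with the two-byte VEX prefix (0xc5 ...), i.e. they match the
; plain AVX vpmuludq, while every masked form needs the four-byte EVEX
; prefix (0x62 ...).
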
define <2 x i64> @test_mask_mul_epu32_rr_128(<4 x i32> %a, <4 x i32> %b) {
  ;CHECK-LABEL: test_mask_mul_epu32_rr_128
  ;CHECK: vpmuludq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf4,0xc1]
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32> %a, <4 x i32> %b, <2 x i64> zeroinitializer, i8 -1)
  ret <2 x i64> %res
}

define <2 x i64> @test_mask_mul_epu32_rrk_128(<4 x i32> %a, <4 x i32> %b, <2 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rrk_128
  ;CHECK: vpmuludq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xd1]
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32> %a, <4 x i32> %b, <2 x i64> %passThru, i8 %mask)
  ret <2 x i64> %res
}

define <2 x i64> @test_mask_mul_epu32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rrkz_128
  ;CHECK: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1]
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32> %a, <4 x i32> %b, <2 x i64> zeroinitializer, i8 %mask)
  ret <2 x i64> %res
}

define <2 x i64> @test_mask_mul_epu32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_mul_epu32_rm_128
  ;CHECK: vpmuludq (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf4,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32> %a, <4 x i32> %b, <2 x i64> zeroinitializer, i8 -1)
  ret <2 x i64> %res
}

define <2 x i64> @test_mask_mul_epu32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmk_128
  ;CHECK: vpmuludq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0x0f]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32> %a, <4 x i32> %b, <2 x i64> %passThru, i8 %mask)
  ret <2 x i64> %res
}

define <2 x i64> @test_mask_mul_epu32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmkz_128
  ;CHECK: vpmuludq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32> %a, <4 x i32> %b, <2 x i64> zeroinitializer, i8 %mask)
  ret <2 x i64> %res
}

define <2 x i64> @test_mask_mul_epu32_rmb_128(<4 x i32> %a, i64* %ptr_b) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmb_128
  ;CHECK: vpmuludq (%rdi){1to2}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x07]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %b = bitcast <2 x i64> %b64 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32> %a, <4 x i32> %b, <2 x i64> zeroinitializer, i8 -1)
  ret <2 x i64> %res
}

define <2 x i64> @test_mask_mul_epu32_rmbk_128(<4 x i32> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmbk_128
  ;CHECK: vpmuludq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x0f]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %b = bitcast <2 x i64> %b64 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32> %a, <4 x i32> %b, <2 x i64> %passThru, i8 %mask)
  ret <2 x i64> %res
}

define <2 x i64> @test_mask_mul_epu32_rmbkz_128(<4 x i32> %a, i64* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmbkz_128
  ;CHECK: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x07]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %b = bitcast <2 x i64> %b64 to <4 x i32>
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32> %a, <4 x i32> %b, <2 x i64> zeroinitializer, i8 %mask)
  ret <2 x i64> %res
}

declare <2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(<4 x i32>, <4 x i32>, <2 x i64>, i8)

define <4 x i64> @test_mask_mul_epu32_rr_256(<8 x i32> %a, <8 x i32> %b) {
  ;CHECK-LABEL: test_mask_mul_epu32_rr_256
  ;CHECK: vpmuludq %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf4,0xc1]
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32> %a, <8 x i32> %b, <4 x i64> zeroinitializer, i8 -1)
  ret <4 x i64> %res
}

define <4 x i64> @test_mask_mul_epu32_rrk_256(<8 x i32> %a, <8 x i32> %b, <4 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rrk_256
  ;CHECK: vpmuludq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xd1]
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32> %a, <8 x i32> %b, <4 x i64> %passThru, i8 %mask)
  ret <4 x i64> %res
}

define <4 x i64> @test_mask_mul_epu32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rrkz_256
  ;CHECK: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1]
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32> %a, <8 x i32> %b, <4 x i64> zeroinitializer, i8 %mask)
  ret <4 x i64> %res
}

define <4 x i64> @test_mask_mul_epu32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_mul_epu32_rm_256
  ;CHECK: vpmuludq (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf4,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32> %a, <8 x i32> %b, <4 x i64> zeroinitializer, i8 -1)
  ret <4 x i64> %res
}

define <4 x i64> @test_mask_mul_epu32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmk_256
  ;CHECK: vpmuludq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0x0f]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32> %a, <8 x i32> %b, <4 x i64> %passThru, i8 %mask)
  ret <4 x i64> %res
}

define <4 x i64> @test_mask_mul_epu32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmkz_256
  ;CHECK: vpmuludq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32> %a, <8 x i32> %b, <4 x i64> zeroinitializer, i8 %mask)
  ret <4 x i64> %res
}

define <4 x i64> @test_mask_mul_epu32_rmb_256(<8 x i32> %a, i64* %ptr_b) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmb_256
  ;CHECK: vpmuludq (%rdi){1to4}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x07]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %b = bitcast <4 x i64> %b64 to <8 x i32>
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32> %a, <8 x i32> %b, <4 x i64> zeroinitializer, i8 -1)
  ret <4 x i64> %res
}

define <4 x i64> @test_mask_mul_epu32_rmbk_256(<8 x i32> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmbk_256
  ;CHECK: vpmuludq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x0f]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %b = bitcast <4 x i64> %b64 to <8 x i32>
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32> %a, <8 x i32> %b, <4 x i64> %passThru, i8 %mask)
  ret <4 x i64> %res
}

define <4 x i64> @test_mask_mul_epu32_rmbkz_256(<8 x i32> %a, i64* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_mul_epu32_rmbkz_256
  ;CHECK: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x07]
  %q = load i64, i64* %ptr_b
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %b64 = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %b = bitcast <4 x i64> %b64 to <8 x i32>
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32> %a, <8 x i32> %b, <4 x i64> zeroinitializer, i8 %mask)
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(<8 x i32>, <8 x i32>, <4 x i64>, i8)

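; Note the prefixes in the expected encodings: the unmasked vpmuludq/vpmuldq
; forms above match the short VEX encoding (0xc5 prefix byte), while masked,
; zeroed, and broadcast forms can only be expressed with the four-byte EVEX
; prefix, so those encodings start with 0x62. The add/sub tests below expect
; the EVEX form even when unmasked.
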
define <4 x i32> @test_mask_add_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
  ;CHECK-LABEL: test_mask_add_epi32_rr_128
  ;CHECK: vpaddd %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_add_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rrk_128
  ;CHECK: vpaddd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xd1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_add_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rrkz_128
  ;CHECK: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_add_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_add_epi32_rm_128
  ;CHECK: vpaddd (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xfe,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_add_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rmk_128
  ;CHECK: vpaddd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x0f]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_add_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rmkz_128
  ;CHECK: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_add_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_add_epi32_rmb_128
  ;CHECK: vpaddd (%rdi){1to4}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_add_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rmbk_128
  ;CHECK: vpaddd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_add_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rmbkz_128
  ;CHECK: vpaddd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <4 x i32> @test_mask_sub_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
  ;CHECK-LABEL: test_mask_sub_epi32_rr_128
  ;CHECK: vpsubd %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_sub_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rrk_128
  ;CHECK: vpsubd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0xd1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_sub_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rrkz_128
  ;CHECK: vpsubd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_sub_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_sub_epi32_rm_128
  ;CHECK: vpsubd (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xfa,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_sub_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmk_128
  ;CHECK: vpsubd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0x0f]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_sub_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmkz_128
  ;CHECK: vpsubd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_sub_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmb_128
  ;CHECK: vpsubd (%rdi){1to4}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0x7d,0x18,0xfa,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_sub_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmbk_128
  ;CHECK: vpsubd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfa,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_sub_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmbkz_128
  ;CHECK: vpsubd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfa,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <8 x i32> @test_mask_sub_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
  ;CHECK-LABEL: test_mask_sub_epi32_rr_256
  ;CHECK: vpsubd %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_sub_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rrk_256
  ;CHECK: vpsubd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0xd1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_sub_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rrkz_256
  ;CHECK: vpsubd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_sub_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_sub_epi32_rm_256
  ;CHECK: vpsubd (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xfa,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_sub_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmk_256
  ;CHECK: vpsubd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0x0f]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_sub_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmkz_256
  ;CHECK: vpsubd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_sub_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmb_256
  ;CHECK: vpsubd (%rdi){1to8}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0x7d,0x38,0xfa,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_sub_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmbk_256
  ;CHECK: vpsubd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfa,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_sub_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_sub_epi32_rmbkz_256
  ;CHECK: vpsubd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfa,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

define <8 x i32> @test_mask_add_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
  ;CHECK-LABEL: test_mask_add_epi32_rr_256
  ;CHECK: vpaddd %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_add_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rrk_256
  ;CHECK: vpaddd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xd1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_add_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rrkz_256
  ;CHECK: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_add_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_add_epi32_rm_256
  ;CHECK: vpaddd (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xfe,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_add_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rmk_256
  ;CHECK: vpaddd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x0f]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_add_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rmkz_256
  ;CHECK: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_add_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_add_epi32_rmb_256
  ;CHECK: vpaddd (%rdi){1to8}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_add_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rmbk_256
  ;CHECK: vpaddd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_add_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_add_epi32_rmbkz_256
  ;CHECK: vpaddd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

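; vpandd, vpord, vpxord, and vpandnd are the EVEX-only, 32-bit-element
; variants of the legacy logic instructions, so every encoding checked below
; starts with the 0x62 EVEX prefix, including the unmasked forms.
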
define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
  ;CHECK-LABEL: test_mask_and_epi32_rr_128
  ;CHECK: vpandd  %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_and_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rrk_128
  ;CHECK: vpandd  %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_and_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rrkz_128
  ;CHECK: vpandd  %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_and_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_and_epi32_rm_128
  ;CHECK: vpandd  (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xdb,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_and_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rmk_128
  ;CHECK: vpandd  (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x0f]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_and_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rmkz_128
  ;CHECK: vpandd  (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_and_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_and_epi32_rmb_128
  ;CHECK: vpandd  (%rdi){1to4}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_and_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rmbk_128
  ;CHECK: vpandd  (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_and_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rmbkz_128
  ;CHECK: vpandd  (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <8 x i32> @test_mask_and_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
  ;CHECK-LABEL: test_mask_and_epi32_rr_256
  ;CHECK: vpandd  %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_and_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rrk_256
  ;CHECK: vpandd  %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_and_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rrkz_256
  ;CHECK: vpandd  %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_and_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_and_epi32_rm_256
  ;CHECK: vpandd  (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xdb,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_and_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rmk_256
  ;CHECK: vpandd  (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x0f]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_and_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rmkz_256
  ;CHECK: vpandd  (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_and_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_and_epi32_rmb_256
  ;CHECK: vpandd  (%rdi){1to8}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_and_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rmbk_256
  ;CHECK: vpandd  (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_and_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_and_epi32_rmbkz_256
  ;CHECK: vpandd  (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

define <4 x i32> @test_mask_or_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
  ;CHECK-LABEL: test_mask_or_epi32_rr_128
  ;CHECK: vpord   %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_or_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rrk_128
  ;CHECK: vpord   %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_or_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rrkz_128
  ;CHECK: vpord   %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_or_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_or_epi32_rm_128
  ;CHECK: vpord   (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xeb,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_or_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rmk_128
  ;CHECK: vpord   (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x0f]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_or_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rmkz_128
  ;CHECK: vpord   (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_or_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_or_epi32_rmb_128
  ;CHECK: vpord   (%rdi){1to4}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_or_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rmbk_128
  ;CHECK: vpord   (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_or_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rmbkz_128
  ;CHECK: vpord   (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <8 x i32> @test_mask_or_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
  ;CHECK-LABEL: test_mask_or_epi32_rr_256
  ;CHECK: vpord   %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_or_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rrk_256
  ;CHECK: vpord   %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_or_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rrkz_256
  ;CHECK: vpord   %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_or_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_or_epi32_rm_256
  ;CHECK: vpord   (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xeb,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_or_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rmk_256
  ;CHECK: vpord   (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x0f]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_or_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rmkz_256
  ;CHECK: vpord   (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_or_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_or_epi32_rmb_256
  ;CHECK: vpord   (%rdi){1to8}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_or_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rmbk_256
  ;CHECK: vpord   (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_or_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_or_epi32_rmbkz_256
  ;CHECK: vpord   (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

define <4 x i32> @test_mask_xor_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
  ;CHECK-LABEL: test_mask_xor_epi32_rr_128
  ;CHECK: vpxord  %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_xor_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rrk_128
  ;CHECK: vpxord  %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_xor_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rrkz_128
  ;CHECK: vpxord  %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_xor_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_xor_epi32_rm_128
  ;CHECK: vpxord  (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xef,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_xor_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmk_128
  ;CHECK: vpxord  (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0x0f]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_xor_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmkz_128
  ;CHECK: vpxord  (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_xor_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmb_128
  ;CHECK: vpxord  (%rdi){1to4}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0x7d,0x18,0xef,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_xor_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmbk_128
  ;CHECK: vpxord  (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xef,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_xor_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmbkz_128
  ;CHECK: vpxord  (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <8 x i32> @test_mask_xor_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
  ;CHECK-LABEL: test_mask_xor_epi32_rr_256
  ;CHECK: vpxord  %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_xor_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rrk_256
  ;CHECK: vpxord  %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_xor_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rrkz_256
  ;CHECK: vpxord  %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_xor_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_xor_epi32_rm_256
  ;CHECK: vpxord  (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xef,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_xor_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmk_256
  ;CHECK: vpxord  (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0x0f]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_xor_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmkz_256
  ;CHECK: vpxord  (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_xor_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmb_256
  ;CHECK: vpxord  (%rdi){1to8}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0x7d,0x38,0xef,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_xor_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmbk_256
  ;CHECK: vpxord  (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xef,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_xor_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_xor_epi32_rmbkz_256
  ;CHECK: vpxord  (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

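; vpandnd computes (~a) & b: the first source operand is the complemented one.
; The tests below only check the mnemonic and encoding, not that semantic.
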
define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rr_128
  ;CHECK: vpandnd  %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_andnot_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rrk_128
  ;CHECK: vpandnd  %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_andnot_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rrkz_128
  ;CHECK: vpandnd  %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0xc1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rm_128
  ;CHECK: vpandnd  (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_andnot_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmk_128
  ;CHECK: vpandnd  (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x0f]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_andnot_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmkz_128
  ;CHECK: vpandnd  (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x07]
  %b = load <4 x i32>, <4 x i32>* %ptr_b
  %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_andnot_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmb_128
  ;CHECK: vpandnd  (%rdi){1to4}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_andnot_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmbk_128
  ;CHECK: vpandnd  (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask)
  ret <4 x i32> %res
}

define <4 x i32> @test_mask_andnot_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmbkz_128
  ;CHECK: vpandnd  (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 %mask)
  ret <4 x i32> %res
}

declare <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <8 x i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rr_256
  ;CHECK: vpandnd  %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_andnot_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rrk_256
  ;CHECK: vpandnd  %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_andnot_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rrkz_256
  ;CHECK: vpandnd  %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1]
  %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rm_256
  ;CHECK: vpandnd  (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_andnot_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmk_256
  ;CHECK: vpandnd  (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x0f]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_andnot_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmkz_256
  ;CHECK: vpandnd  (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x07]
  %b = load <8 x i32>, <8 x i32>* %ptr_b
  %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_andnot_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmb_256
  ;CHECK: vpandnd  (%rdi){1to8}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_andnot_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmbk_256
  ;CHECK: vpandnd  (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x0f]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask)
  ret <8 x i32> %res
}

define <8 x i32> @test_mask_andnot_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
  ;CHECK-LABEL: test_mask_andnot_epi32_rmbkz_256
  ;CHECK: vpandnd  (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x07]
  %q = load i32, i32* %ptr_b
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 %mask)
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

2111define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
2112  ;CHECK-LABEL: test_mask_andnot_epi64_rr_128
2113  ;CHECK: vpandnq  %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0xc1]
2114  %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
2115  ret <2 x i64> %res
2116}
2117
2118define <2 x i64> @test_mask_andnot_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
2119  ;CHECK-LABEL: test_mask_andnot_epi64_rrk_128
2120  ;CHECK: vpandnq  %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xd1]
2121  %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
2122  ret <2 x i64> %res
2123}
2124
2125define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
2126  ;CHECK-LABEL: test_mask_andnot_epi64_rrkz_128
2127  ;CHECK: vpandnq  %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1]
2128  %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
2129  ret <2 x i64> %res
2130}
2131
2132define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
2133  ;CHECK-LABEL: test_mask_andnot_epi64_rm_128
2134  ;CHECK: vpandnq  (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0x07]
2135  %b = load <2 x i64>, <2 x i64>* %ptr_b
2136  %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
2137  ret <2 x i64> %res
2138}
2139
2140define <2 x i64> @test_mask_andnot_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
2141  ;CHECK-LABEL: test_mask_andnot_epi64_rmk_128
2142  ;CHECK: vpandnq  (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0x0f]
2143  %b = load <2 x i64>, <2 x i64>* %ptr_b
2144  %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
2145  ret <2 x i64> %res
2146}
2147
2148define <2 x i64> @test_mask_andnot_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
2149  ;CHECK-LABEL: test_mask_andnot_epi64_rmkz_128
2150  ;CHECK: vpandnq  (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0x07]
2151  %b = load <2 x i64>, <2 x i64>* %ptr_b
2152  %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
2153  ret <2 x i64> %res
2154}
2155
2156define <2 x i64> @test_mask_andnot_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
2157  ;CHECK-LABEL: test_mask_andnot_epi64_rmb_128
2158  ;CHECK: vpandnq  (%rdi){1to2}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0xfd,0x18,0xdf,0x07]
2159  %q = load i64, i64* %ptr_b
2160  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
2161  %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
2162  %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
2163  ret <2 x i64> %res
2164}
2165
2166define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
2167  ;CHECK-LABEL: test_mask_andnot_epi64_rmbk_128
2168  ;CHECK: vpandnq  (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x0f]
2169  %q = load i64, i64* %ptr_b
2170  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
2171  %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
2172  %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)
2173  ret <2 x i64> %res
2174}
2175
2176define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
2177  ;CHECK-LABEL: test_mask_andnot_epi64_rmbkz_128
2178  ;CHECK: vpandnq  (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x07]
2179  %q = load i64, i64* %ptr_b
2180  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
2181  %b = shufflevector <2 x i64> %vecinit.i, <2 x i64> undef, <2 x i32> zeroinitializer
2182  %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
2183  ret <2 x i64> %res
2184}
2185
2186declare <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
2187
2188define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
2189  ;CHECK-LABEL: test_mask_andnot_epi64_rr_256
2190  ;CHECK: vpandnq  %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0xc1]
2191  %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
2192  ret <4 x i64> %res
2193}
2194
2195define <4 x i64> @test_mask_andnot_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
2196  ;CHECK-LABEL: test_mask_andnot_epi64_rrk_256
2197  ;CHECK: vpandnq  %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xd1]
2198  %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
2199  ret <4 x i64> %res
2200}
2201
2202define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
2203  ;CHECK-LABEL: test_mask_andnot_epi64_rrkz_256
2204  ;CHECK: vpandnq  %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1]
2205  %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
2206  ret <4 x i64> %res
2207}
2208
2209define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
2210  ;CHECK-LABEL: test_mask_andnot_epi64_rm_256
2211  ;CHECK: vpandnq  (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0x07]
2212  %b = load <4 x i64>, <4 x i64>* %ptr_b
2213  %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
2214  ret <4 x i64> %res
2215}
2216
2217define <4 x i64> @test_mask_andnot_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
2218  ;CHECK-LABEL: test_mask_andnot_epi64_rmk_256
2219  ;CHECK: vpandnq  (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0x0f]
2220  %b = load <4 x i64>, <4 x i64>* %ptr_b
2221  %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
2222  ret <4 x i64> %res
2223}
2224
2225define <4 x i64> @test_mask_andnot_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
2226  ;CHECK-LABEL: test_mask_andnot_epi64_rmkz_256
2227  ;CHECK: vpandnq  (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0x07]
2228  %b = load <4 x i64>, <4 x i64>* %ptr_b
2229  %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
2230  ret <4 x i64> %res
2231}
2232
2233define <4 x i64> @test_mask_andnot_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
2234  ;CHECK-LABEL: test_mask_andnot_epi64_rmb_256
2235  ;CHECK: vpandnq  (%rdi){1to4}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0xfd,0x38,0xdf,0x07]
2236  %q = load i64, i64* %ptr_b
2237  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
2238  %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
2239  %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
2240  ret <4 x i64> %res
2241}
2242
2243define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
2244  ;CHECK-LABEL: test_mask_andnot_epi64_rmbk_256
2245  ;CHECK: vpandnq  (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x0f]
2246  %q = load i64, i64* %ptr_b
2247  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
2248  %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
2249  %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)
2250  ret <4 x i64> %res
2251}
2252
2253define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
2254  ;CHECK-LABEL: test_mask_andnot_epi64_rmbkz_256
2255  ;CHECK: vpandnq  (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x07]
2256  %q = load i64, i64* %ptr_b
2257  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
2258  %b = shufflevector <4 x i64> %vecinit.i, <4 x i64> undef, <4 x i32> zeroinitializer
2259  %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
2260  ret <4 x i64> %res
2261}
2262
2263declare <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
2264
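; The cmp tests below pass comparison immediate 2, which the assembler prints
; as the "le" predicate (vcmpleps/vcmplepd); the trailing 0x02 in each
; encoding is that immediate.
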
define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
  ;CHECK-LABEL: test_cmpps_256
  ;CHECK: vcmpleps  %ymm1, %ymm0, %k0  ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]
  %res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1)
  ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, i8)

define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
  ;CHECK-LABEL: test_cmpps_128
  ;CHECK: vcmpleps  %xmm1, %xmm0, %k0  ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
  %res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1)
  ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, i8)

define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
  ;CHECK-LABEL: test_cmppd_256
  ;CHECK: vcmplepd  %ymm1, %ymm0, %k0  ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]
  %res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1)
  ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, i8)

define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
  ;CHECK-LABEL: test_cmppd_128
  ;CHECK: vcmplepd  %xmm1, %xmm0, %k0  ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]
  %res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1)
  ret i8 %res
}
declare i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, i8)

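; The masked arithmetic intrinsics below fold the masking into the operation:
; merge-masking ({%k1}) writes results into the passthru register and
; zero-masking ({%k1} {z}) zeroes inactive lanes. A rough generic-IR sketch of
; the merge form (names are illustrative, not part of the test):
;   %sum = fadd <8 x float> %a0, %a1
;   %r   = select <8 x i1> %m, <8 x float> %sum, <8 x float> %src
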
define <8 x float> @test_mm512_maskz_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_add_ps_256
  ;CHECK: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z}
  %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_add_ps_256
  ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1}
  %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_add_ps_256
  ;CHECK: vaddps %ymm1, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 -1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <4 x float> @test_mm512_maskz_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_add_ps_128
  ;CHECK: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z}
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_add_ps_128
  ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1}
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_add_ps_128
  ;CHECK: vaddps %xmm1, %xmm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <8 x float> @test_mm512_maskz_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_sub_ps_256
  ;CHECK: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z}
  %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_sub_ps_256
  ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1}
  %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_sub_ps_256
  ;CHECK: vsubps %ymm1, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 -1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <4 x float> @test_mm512_maskz_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_sub_ps_128
  ;CHECK: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z}
  %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_sub_ps_128
  ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1}
  %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_sub_ps_128
  ;CHECK: vsubps %xmm1, %xmm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <8 x float> @test_mm512_maskz_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_mul_ps_256
  ;CHECK: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z}
  %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_mul_ps_256
  ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1}
  %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mul_ps_256
  ;CHECK: vmulps %ymm1, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 -1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <4 x float> @test_mm512_maskz_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_mul_ps_128
  ;CHECK: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z}
  %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_mul_ps_128
  ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1}
  %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mul_ps_128
  ;CHECK: vmulps %xmm1, %xmm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <8 x float> @test_mm512_maskz_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_div_ps_256
  ;CHECK: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z}
  %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_div_ps_256
  ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1}
  %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_div_ps_256
  ;CHECK: vdivps %ymm1, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 -1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <4 x float> @test_mm512_maskz_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_div_ps_128
  ;CHECK: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z}
  %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_div_ps_128
  ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1}
  %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_div_ps_128
  ;CHECK: vdivps %xmm1, %xmm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_max_ps_256
  ;CHECK: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z}
  %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_max_ps_256
  ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1}
  %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_max_ps_256
  ;CHECK: vmaxps %ymm1, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 -1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_max_ps_128
  ;CHECK: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z}
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_max_ps_128
  ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1}
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_max_ps_128
  ;CHECK: vmaxps %xmm1, %xmm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_min_ps_256
  ;CHECK: vminps %ymm1, %ymm0, %ymm0 {%k1} {z}
  %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_min_ps_256
  ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1}
  %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_min_ps_256
  ;CHECK: vminps %ymm1, %ymm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 -1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_maskz_min_ps_128
  ;CHECK: vminps %xmm1, %xmm0, %xmm0 {%k1} {z}
  %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
  ;CHECK-LABEL: test_mm512_mask_min_ps_128
  ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1}
  %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
  ;CHECK-LABEL: test_mm512_min_ps_128
  ;CHECK: vminps %xmm1, %xmm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
  ; CHECK-LABEL: test_sqrt_pd_256
  ; CHECK: vsqrtpd
  %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
  ; CHECK-LABEL: test_sqrt_ps_256
  ; CHECK: vsqrtps
  %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
  ret <8 x float> %res
}

declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

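; VGETEXP extracts each element's exponent as a floating-point value
; (roughly floor(log2(|x|)) for normal inputs); these tests only check that
; the expected mnemonic is emitted.
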
define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {
  ; CHECK-LABEL: test_getexp_pd_256
  ; CHECK: vgetexppd
  %res = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
  ret <4 x double> %res
}

declare <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <8 x float> @test_getexp_ps_256(<8 x float> %a0) {
  ; CHECK-LABEL: test_getexp_ps_256
  ; CHECK: vgetexpps
  %res = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

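; Each test_int_x86_* test below calls its intrinsic twice (masked, then
; unmasked or zero-masked) and sums the results, so both forms stay live
; through optimization and must appear in the output.
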
declare <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_128
; CHECK-NOT: call
; CHECK: vpmaxsd %xmm
; CHECK: {%k1}
define <4 x i32> @test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_256
; CHECK-NOT: call
; CHECK: vpmaxsd %ymm
; CHECK: {%k1}
define <8 x i32> @test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
  %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_128
; CHECK-NOT: call
; CHECK: vpmaxsq %xmm
; CHECK: {%k1}
define <2 x i64> @test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_256
; CHECK-NOT: call
; CHECK: vpmaxsq %ymm
; CHECK: {%k1}
define <4 x i64> @test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_128
; CHECK-NOT: call
; CHECK: vpmaxud %xmm
; CHECK: {%k1}
define <4 x i32> @test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_256
; CHECK-NOT: call
; CHECK: vpmaxud %ymm
; CHECK: {%k1}
define <8 x i32> @test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
  %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_128
; CHECK-NOT: call
; CHECK: vpmaxuq %xmm
; CHECK: {%k1}
define <2 x i64> @test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_256
; CHECK-NOT: call
; CHECK: vpmaxuq %ymm
; CHECK: {%k1}
define <4 x i64> @test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_128
; CHECK-NOT: call
; CHECK: vpminsd %xmm
; CHECK: {%k1}
define <4 x i32> @test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_256
; CHECK-NOT: call
; CHECK: vpminsd %ymm
; CHECK: {%k1}
define <8 x i32> @test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
  %res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_128
; CHECK-NOT: call
; CHECK: vpminsq %xmm
; CHECK: {%k1}
define <2 x i64> @test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
  %res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_256
; CHECK-NOT: call
; CHECK: vpminsq %ymm
; CHECK: {%k1}
define <4 x i64> @test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
  %res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_128
; CHECK-NOT: call
; CHECK: vpminud %xmm
; CHECK: {%k1}
define <4 x i32> @test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_256
; CHECK-NOT: call
; CHECK: vpminud %ymm
; CHECK: {%k1}
define <8 x i32> @test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
  %res = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_128
; CHECK-NOT: call
; CHECK: vpminuq %xmm
; CHECK: {%k1}
define <2 x i64> @test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
  %res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_256
; CHECK-NOT: call
; CHECK: vpminuq %ymm
; CHECK: {%k1}
define <4 x i64> @test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
  %res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

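; VPERMT2* and VPERMI2* shuffle elements out of two source tables using a
; vector of indices; at the instruction level the t2 form overwrites a table
; operand and the i2 form overwrites the index operand. The maskz variants
; additionally check the {z} zero-masking marker.
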
declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpermt2d %xmm{{.*}}{%k1}
; CHECK-NOT: {z}
define <4 x i32> @test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpermt2d %xmm{{.*}}{%k1} {z}
define <4 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
  %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
  %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpermt2d %ymm{{.*}}{%k1}
; CHECK-NOT: {z}
define <8 x i32> @test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
  %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpermt2d {{.*}}{%k1} {z}
define <8 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
  %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
  %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpermi2pd %xmm{{.*}}{%k1}
define <2 x double> @test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
  %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpermi2pd %ymm{{.*}}{%k1}
define <4 x double> @test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
  %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpermi2ps %xmm{{.*}}{%k1}
define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
  %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpermi2ps %ymm{{.*}}{%k1}
define <8 x float> @test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
  %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpabsq{{.*}}{%k1}
define <2 x i64> @test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
  %res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpabsq{{.*}}{%k1}
define <4 x i64> @test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
  %res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32>, <4 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpabsd{{.*}}{%k1}
define <4 x i32> @test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
  %res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32>, <8 x i32>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vpabsd{{.*}}{%k1}
define <8 x i32> @test_int_x86_avx512_mask_pabs_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
  %res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

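; VSCALEFP* computes x * 2^floor(y) per element, where x is the first source
; and y the second; the tests below only check the mnemonic and masking.
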
declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vscalefpd{{.*}}{%k1}
define <2 x double> @test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
  %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vscalefpd{{.*}}{%k1}
define <4 x double> @test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
  %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vscalefps{{.*}}{%k1}
define <4 x float> @test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
  %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vscalefps{{.*}}{%k1}
define <8 x float> @test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
  %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

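; The unpck tests also verify the shuffle decode comments: unpckh interleaves
; the high elements and unpckl the low elements of each 128-bit lane, as
; spelled out in the "xmm0 = xmm0[1],xmm1[1]"-style annotations below.
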
declare <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double> @test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128:
; CHECK:         vunpckhpd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    ## xmm2 = xmm2[1],k1[1]
; CHECK-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x15,0xc1]
; CHECK-NEXT:    ## xmm0 = xmm0[1],xmm1[1]
  %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double> @test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_256:
; CHECK:         vunpckhpd %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm2[1],k1[1],ymm2[3],k1[3]
; CHECK-NEXT:    vunpckhpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x15,0xc1]
; CHECK-NEXT:    ## ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
  %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_128:
; CHECK:         vunpckhps %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    ## xmm2 = xmm2[2],k1[2],xmm2[3],k1[3]
; CHECK-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x15,0xc1]
; CHECK-NEXT:    ## xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
  %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256:
; CHECK:       ## BB#0:
; CHECK:         vunpckhps %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm2[2],k1[2],ymm2[3],k1[3],ymm2[6],k1[6],ymm2[7],k1[7]
; CHECK-NEXT:    vunpckhps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x15,0xc1]
; CHECK-NEXT:    ## ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
  %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)

define <2 x double> @test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_128:
; CHECK:         vunpcklpd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[0]
; CHECK-NEXT:    vunpcklpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x14,0xc1]
; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0]
  %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double> @test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_256:
; CHECK:         vunpcklpd %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[0],ymm2[2],k1[2]
; CHECK-NEXT:    vunpcklpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x14,0xc1]
; CHECK-NEXT:    ## ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
  %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)

define <4 x float> @test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_128:
; CHECK:         vunpcklps %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1]
; CHECK-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x14,0xc1]
; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
  %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)

define <8 x float> @test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_256:
; CHECK:         vunpcklps %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[4],k1[4],ymm2[5],k1[5]
; CHECK-NEXT:    vunpcklps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x14,0xc1]
; CHECK-NEXT:    ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
  %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <4 x i32> @test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_128:
; CHECK:         vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    ## xmm2 = xmm2[2],k1[2],xmm2[3],k1[3]
; CHECK-NEXT:    vpunpckhdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x6a,0xc1]
; CHECK-NEXT:    ## xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
  %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)

define <4 x i32> @test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_128:
; CHECK:         vpunpckldq %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1]
; CHECK-NEXT:    vpunpckldq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x62,0xc1]
; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
  %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)

define <8 x i32> @test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_256:
; CHECK:       ## BB#0:
; CHECK:         vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm2[2],k1[2],ymm2[3],k1[3],ymm2[6],k1[6],ymm2[7],k1[7]
; CHECK-NEXT:    vpunpckhdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x6a,0xc1]
; CHECK-NEXT:    ## ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
  %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
3160  %res2 = add <8 x i32> %res, %res1
3161  ret <8 x i32> %res2
3162}
3163
3164declare <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
3165
3166define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
3167; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_256:
3168; CHECK:         vpunpckldq %ymm1, %ymm0, %ymm2 {%k1}
3169; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[4],k1[4],ymm2[5],k1[5]
3170; CHECK-NEXT:    vpunpckldq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x62,0xc1]
3171; CHECK-NEXT:    ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3172  %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
3173  %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
3174  %res2 = add <8 x i32> %res, %res1
3175  ret <8 x i32> %res2
3176}
3177
3178declare <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
3179
3180define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
3181; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128:
3182; CHECK:         vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1}
3183; CHECK-NEXT:    ## xmm2 = xmm2[1],k1[1]
3184; CHECK-NEXT:    vpunpckhqdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6d,0xc1]
3185; CHECK-NEXT:    ## xmm0 = xmm0[1],xmm1[1]
3186  %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
3187  %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
3188  %res2 = add <2 x i64> %res, %res1
3189  ret <2 x i64> %res2
3190}
3191
3192declare <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
3193
3194define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
3195; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128:
3196; CHECK:         vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1}
3197; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[0]
3198; CHECK-NEXT:    vpunpcklqdq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x6c,0xc1]
3199; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0]
3200  %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
3201  %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
3202  %res2 = add <2 x i64> %res, %res1
3203  ret <2 x i64> %res2
3204}
3205
3206declare <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
3207
3208define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
3209; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256:
3210; CHECK:         vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1}
3211; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[0],ymm2[2],k1[2]
3212; CHECK-NEXT:    vpunpcklqdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6c,0xc1]
3213; CHECK-NEXT:    ## ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3214  %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
3215  %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
3216  %res2 = add <4 x i64> %res, %res1
3217  ret <4 x i64> %res2
3218}
3219
3220declare <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
3221
3222define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
3223; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256:
3224; CHECK:         vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1}
3225; CHECK-NEXT:    ## ymm2 = ymm2[1],k1[1],ymm2[3],k1[3]
3226; CHECK-NEXT:    vpunpckhqdq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x6d,0xc1]
3227; CHECK-NEXT:    ## ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3228  %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
3229  %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
3230  %res2 = add <4 x i64> %res, %res1
3231  ret <4 x i64> %res2
3232}
3233
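; Truncating moves. vpmovqb narrows each qword element to a byte: the plain
; pmov variants truncate, while pmovs/pmovus saturate as signed/unsigned.
; Each intrinsic is tested in merge-masked, zero-masked, and unmasked register
; forms, plus a masked/unmasked memory (store) form.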
declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_128:
; CHECK:       vpmovqb %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovqb %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovqb %xmm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64>, i8)

define void @test_int_x86_avx512_mask_pmov_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_128:
; CHECK:  vpmovqb %xmm0, (%rdi)
; CHECK:  vpmovqb %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_128:
; CHECK:       vpmovsqb %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsqb %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsqb %xmm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_128:
; CHECK:  vpmovsqb %xmm0, (%rdi)
; CHECK:  vpmovsqb %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_128:
; CHECK:       vpmovusqb %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusqb %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusqb %xmm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_128:
; CHECK:  vpmovusqb %xmm0, (%rdi)
; CHECK:  vpmovusqb %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_256:
; CHECK:       vpmovqb %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovqb %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovqb %ymm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64>, i8)

define void @test_int_x86_avx512_mask_pmov_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_256:
; CHECK:  vpmovqb %ymm0, (%rdi)
; CHECK:  vpmovqb %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_256:
; CHECK:       vpmovsqb %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsqb %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsqb %ymm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_256:
; CHECK:  vpmovsqb %ymm0, (%rdi)
; CHECK:  vpmovsqb %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_256:
; CHECK:       vpmovusqb %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusqb %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusqb %ymm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_256:
; CHECK:  vpmovusqb %ymm0, (%rdi)
; CHECK:  vpmovusqb %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
    ret void
}

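; qword-to-word truncations (vpmovqw / vpmovsqw / vpmovusqw).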
declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_128:
; CHECK:       vpmovqw %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovqw %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovqw %xmm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64>, i8)

define void @test_int_x86_avx512_mask_pmov_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_128:
; CHECK:  vpmovqw %xmm0, (%rdi)
; CHECK:  vpmovqw %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_128:
; CHECK:       vpmovsqw %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsqw %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsqw %xmm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_128:
; CHECK:  vpmovsqw %xmm0, (%rdi)
; CHECK:  vpmovsqw %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_128:
; CHECK:       vpmovusqw %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusqw %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusqw %xmm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_128:
; CHECK:  vpmovusqw %xmm0, (%rdi)
; CHECK:  vpmovusqw %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_256:
; CHECK:       vpmovqw %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovqw %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovqw %ymm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64>, i8)

define void @test_int_x86_avx512_mask_pmov_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_256:
; CHECK:  vpmovqw %ymm0, (%rdi)
; CHECK:  vpmovqw %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_256:
; CHECK:       vpmovsqw %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsqw %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsqw %ymm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_256:
; CHECK:  vpmovsqw %ymm0, (%rdi)
; CHECK:  vpmovsqw %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_256:
; CHECK:       vpmovusqw %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusqw %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusqw %ymm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_256:
; CHECK:  vpmovusqw %ymm0, (%rdi)
; CHECK:  vpmovusqw %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
    ret void
}

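; qword-to-dword truncations (vpmovqd / vpmovsqd / vpmovusqd).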
declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_128:
; CHECK:       vpmovqd %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovqd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovqd %xmm0, %xmm0
    %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
    %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
    %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
    %res3 = add <4 x i32> %res0, %res1
    %res4 = add <4 x i32> %res3, %res2
    ret <4 x i32> %res4
}

declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64>, i8)

define void @test_int_x86_avx512_mask_pmov_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_128:
; CHECK:  vpmovqd %xmm0, (%rdi)
; CHECK:  vpmovqd %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
    ret void
}

declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_128:
; CHECK:       vpmovsqd %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsqd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsqd %xmm0, %xmm0
    %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
    %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
    %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
    %res3 = add <4 x i32> %res0, %res1
    %res4 = add <4 x i32> %res3, %res2
    ret <4 x i32> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_128:
; CHECK:  vpmovsqd %xmm0, (%rdi)
; CHECK:  vpmovsqd %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
    ret void
}

declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_128:
; CHECK:       vpmovusqd %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusqd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusqd %xmm0, %xmm0
    %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
    %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
    %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
    %res3 = add <4 x i32> %res0, %res1
    %res4 = add <4 x i32> %res3, %res2
    ret <4 x i32> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_128:
; CHECK:  vpmovusqd %xmm0, (%rdi)
; CHECK:  vpmovusqd %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64> %x1, i8 %x2)
    ret void
}

declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_256:
; CHECK:       vpmovqd %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovqd %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovqd %ymm0, %xmm0
    %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
    %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
    %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
    %res3 = add <4 x i32> %res0, %res1
    %res4 = add <4 x i32> %res3, %res2
    ret <4 x i32> %res4
}

declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8)

define void @test_int_x86_avx512_mask_pmov_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_256:
; CHECK:  vpmovqd %ymm0, (%rdi)
; CHECK:  vpmovqd %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
    ret void
}

declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_256:
; CHECK:       vpmovsqd %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsqd %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsqd %ymm0, %xmm0
    %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
    %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
    %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
    %res3 = add <4 x i32> %res0, %res1
    %res4 = add <4 x i32> %res3, %res2
    ret <4 x i32> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_256:
; CHECK:  vpmovsqd %ymm0, (%rdi)
; CHECK:  vpmovsqd %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
    ret void
}

declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_256:
; CHECK:       vpmovusqd %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusqd %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusqd %ymm0, %xmm0
    %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
    %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
    %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
    %res3 = add <4 x i32> %res0, %res1
    %res4 = add <4 x i32> %res3, %res2
    ret <4 x i32> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64>, i8)

define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_256:
; CHECK:  vpmovusqd %ymm0, (%rdi)
; CHECK:  vpmovusqd %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64> %x1, i8 %x2)
    ret void
}

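; dword-to-byte truncations (vpmovdb / vpmovsdb / vpmovusdb).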
declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_128:
; CHECK:       vpmovdb %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovdb %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovdb %xmm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32>, i8)

define void @test_int_x86_avx512_mask_pmov_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_128:
; CHECK:  vpmovdb %xmm0, (%rdi)
; CHECK:  vpmovdb %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_128:
; CHECK:       vpmovsdb %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsdb %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsdb %xmm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32>, i8)

define void @test_int_x86_avx512_mask_pmovs_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_128:
; CHECK:  vpmovsdb %xmm0, (%rdi)
; CHECK:  vpmovsdb %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_128:
; CHECK:       vpmovusdb %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusdb %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusdb %xmm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32>, i8)

define void @test_int_x86_avx512_mask_pmovus_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_128:
; CHECK:  vpmovusdb %xmm0, (%rdi)
; CHECK:  vpmovusdb %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_256:
; CHECK:       vpmovdb %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovdb %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovdb %ymm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32>, i8)

define void @test_int_x86_avx512_mask_pmov_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_256:
; CHECK:  vpmovdb %ymm0, (%rdi)
; CHECK:  vpmovdb %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_256:
; CHECK:       vpmovsdb %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsdb %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsdb %ymm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32>, i8)

define void @test_int_x86_avx512_mask_pmovs_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_256:
; CHECK:  vpmovsdb %ymm0, (%rdi)
; CHECK:  vpmovsdb %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
    ret void
}

declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8)

define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_256:
; CHECK:       vpmovusdb %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusdb %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusdb %ymm0, %xmm0
    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
    %res3 = add <16 x i8> %res0, %res1
    %res4 = add <16 x i8> %res3, %res2
    ret <16 x i8> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32>, i8)

define void @test_int_x86_avx512_mask_pmovus_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_256:
; CHECK:  vpmovusdb %ymm0, (%rdi)
; CHECK:  vpmovusdb %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
    ret void
}

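; dword-to-word truncations (vpmovdw / vpmovsdw / vpmovusdw).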
declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_128:
; CHECK:       vpmovdw %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovdw %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovdw %xmm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32>, i8)

define void @test_int_x86_avx512_mask_pmov_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_128:
; CHECK:  vpmovdw %xmm0, (%rdi)
; CHECK:  vpmovdw %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_128:
; CHECK:       vpmovsdw %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsdw %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsdw %xmm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32>, i8)

define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_128:
; CHECK:  vpmovsdw %xmm0, (%rdi)
; CHECK:  vpmovsdw %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_128:
; CHECK:       vpmovusdw %xmm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusdw %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusdw %xmm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32>, i8)

define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_128:
; CHECK:  vpmovusdw %xmm0, (%rdi)
; CHECK:  vpmovusdw %xmm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_256:
; CHECK:       vpmovdw %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovdw %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovdw %ymm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32>, i8)

define void @test_int_x86_avx512_mask_pmov_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_256:
; CHECK:  vpmovdw %ymm0, (%rdi)
; CHECK:  vpmovdw %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_256:
; CHECK:       vpmovsdw %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovsdw %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovsdw %ymm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32>, i8)

define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_256:
; CHECK:  vpmovsdw %ymm0, (%rdi)
; CHECK:  vpmovsdw %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
    ret void
}

declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)

define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_256:
; CHECK:       vpmovusdw %ymm0, %xmm1 {%k1}
; CHECK-NEXT:  vpmovusdw %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:  vpmovusdw %ymm0, %xmm0
    %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
    %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
    %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
    %res3 = add <8 x i16> %res0, %res1
    %res4 = add <8 x i16> %res3, %res2
    ret <8 x i16> %res4
}

declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32>, i8)

define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_256:
; CHECK:  vpmovusdw %ymm0, (%rdi)
; CHECK:  vpmovusdw %ymm0, (%rdi) {%k1}
    call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 -1)
    call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32> %x1, i8 %x2)
    ret void
}

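; Masked int/fp conversions. Each test exercises the intrinsic with a live
; mask and with an all-ones (no-op) mask and combines the results, covering
; both the masked and unmasked forms.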
4014declare <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32>, <2 x double>, i8)
4015
4016define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
4017; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128:
4018; CHECK:       ## BB#0:
4019; CHECK-NEXT:    movzbl %dil, %eax
4020; CHECK-NEXT:    kmovw %eax, %k1
4021; CHECK-NEXT:    vcvtdq2pd %xmm0, %xmm1 {%k1}
4022; CHECK-NEXT:    vcvtdq2pd %xmm0, %xmm0
4023; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
4024; CHECK-NEXT:    retq
4025  %res = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2)
4026  %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1)
4027  %res2 = fadd <2 x double> %res, %res1
4028  ret <2 x double> %res2
4029}
4030
4031declare <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32>, <4 x double>, i8)
4032
4033define <4 x double>@test_int_x86_avx512_mask_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
4034; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256:
4035; CHECK:       ## BB#0:
4036; CHECK-NEXT:    movzbl %dil, %eax
4037; CHECK-NEXT:    kmovw %eax, %k1
4038; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1 {%k1}
4039; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
4040; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
4041; CHECK-NEXT:    retq
4042  %res = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2)
4043  %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1)
4044  %res2 = fadd <4 x double> %res, %res1
4045  ret <4 x double> %res2
4046}
4047
4048declare <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32>, <4 x float>, i8)
4049
4050define <4 x float>@test_int_x86_avx512_mask_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
4051; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128:
4052; CHECK:       ## BB#0:
4053; CHECK-NEXT:    movzbl %dil, %eax
4054; CHECK-NEXT:    kmovw %eax, %k1
4055; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm1 {%k1}
4056; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
4057; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
4058; CHECK-NEXT:    retq
4059  %res = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2)
4060  %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1)
4061  %res2 = fadd <4 x float> %res, %res1
4062  ret <4 x float> %res2
4063}
4064
4065declare <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32>, <8 x float>, i8)
4066
4067define <8 x float>@test_int_x86_avx512_mask_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
4068; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256:
4069; CHECK:       ## BB#0:
4070; CHECK-NEXT:    movzbl %dil, %eax
4071; CHECK-NEXT:    kmovw %eax, %k1
4072; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm1 {%k1}
4073; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
4074; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
4075; CHECK-NEXT:    retq
4076  %res = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2)
4077  %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1)
4078  %res2 = fadd <8 x float> %res, %res1
4079  ret <8 x float> %res2
4080}
4081
4082declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
4083
4084define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
4085; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128:
4086; CHECK:       ## BB#0:
4087; CHECK-NEXT:    movzbl %dil, %eax
4088; CHECK-NEXT:    kmovw %eax, %k1
4089; CHECK-NEXT:    vcvtpd2dq %xmm0, %xmm1 {%k1}
4090; CHECK-NEXT:    vcvtpd2dq %xmm0, %xmm0
4091; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
4092; CHECK-NEXT:    retq
4093  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
4094  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
4095  %res2 = add <4 x i32> %res, %res1
4096  ret <4 x i32> %res2
4097}
4098
4099declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double>, <4 x i32>, i8)
4100
4101define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
4102; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256:
4103; CHECK:       ## BB#0:
4104; CHECK-NEXT:    movzbl %dil, %eax
4105; CHECK-NEXT:    kmovw %eax, %k1
4106; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm1 {%k1}
4107; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm0
4108; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
4109; CHECK-NEXT:    retq
4110  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
4111  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
4112  %res2 = add <4 x i32> %res, %res1
4113  ret <4 x i32> %res2
4114}
4115
declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 %x2)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2ps %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvtpd2ps %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtps2dq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvtps2dq %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtps2pd %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvtps2pd %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 %x2)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 %x2)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

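; Truncating conversions (vcvttpd2dq, vcvttpd2udq, vcvttps2dq, vcvttps2udq), 128-bit and 256-bit.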
declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttpd2dq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvttpd2dq %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

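; Unsigned-integer to floating-point conversions (vcvtudq2pd, vcvtudq2ps).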
declare <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtudq2pd %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvtudq2pd %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm0
; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

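; vrndscalepd/vrndscaleps: round with an imm8 control. These tests pass assorted immediates (4, 5, 66, 88) to exercise the imm8 operand and use the older freestanding CHECK style.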
declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vrndscalepd {{.*}}{%k1}
; CHECK: vrndscalepd
define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
  %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 88, <2 x double> %x2, i8 -1)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vrndscalepd {{.*}}{%k1}
; CHECK: vrndscalepd
define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
  %res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 88, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_128
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vrndscaleps {{.*}}{%k1}
; CHECK: vrndscaleps
define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
  %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8)

; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_256
; CHECK-NOT: call
; CHECK: kmov
; CHECK: vrndscaleps {{.*}}{%k1}
; CHECK: vrndscaleps
define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
  %res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 66, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

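; Shuffles of whole 128-bit lanes selected by an immediate (vshuff32x4, vshuff64x2, vshufi32x4, vshufi64x2), including merge- and zero-masked variants.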
declare <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float>, <8 x float>, i32, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT:    vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
; CHECK-NEXT:    ## ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT:    vshuff32x4 $22, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
  %res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4)
  %res3 = fadd <8 x float> %res, %res1
  %res4 = fadd <8 x float> %res2, %res3
  ret <8 x float> %res4
}

declare <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double>, <4 x double>, i32, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT:    vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
; CHECK-NEXT:    ## ymm3 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT:    vshuff64x2 $22, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
  %res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4)
  %res3 = fadd <4 x double> %res, %res1
  %res4 = fadd <4 x double> %res2, %res3
  ret <4 x double> %res4
}

declare <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT:    vshufi32x4 $22, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)

define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT:    vshufi64x2 $22, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

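; 128-bit lane extraction (vextractf32x4) with merge-masked, zero-masked, and unmasked forms.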
declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm1 {%k1}
; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vextractf32x4 $1, %ymm0, %xmm0
; CHECK-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res  = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1)
  %res3 = fadd <4 x float> %res, %res1
  %res4 = fadd <4 x float> %res2, %res3
  ret <4 x float> %res4
}

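; Mantissa extraction (vgetmantpd/vgetmantps); the immediate selects the normalization interval and sign behavior.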
declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 %x3)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> zeroinitializer, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 -1)
  %res3 = fadd <2 x double> %res, %res1
  %res4 = fadd <2 x double> %res2, %res3
  ret <2 x double> %res4
}

declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vgetmantpd $11, %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vgetmantpd $11, %ymm0, %ymm0
; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float>, i32, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vgetmantps $11, %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vgetmantps $11, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float>, i32, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vgetmantps $11, %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    vgetmantps $11, %ymm0, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

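; Element shuffles with an immediate (vshufpd/vshufps).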
declare <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double>, <2 x double>, i32, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufpd $22, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[1]
; CHECK-NEXT:    vshufpd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT:    ## xmm3 = k1[0],xmm0[1]
; CHECK-NEXT:    vshufpd $22, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> %x3, i8 %x4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> %x3, i8 -1)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 22, <2 x double> zeroinitializer, i8 %x4)
  %res3 = fadd <2 x double> %res, %res1
  %res4 = fadd <2 x double> %res2, %res3
  ret <2 x double> %res4
}

declare <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double>, <4 x double>, i32, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufpd $22, %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[1],ymm2[3],k1[2]
; CHECK-NEXT:    vshufpd $22, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float>, <4 x float>, i32, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufps $22, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    ## xmm2 = xmm2[2,1],k1[1,0]
; CHECK-NEXT:    vshufps $22, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ## xmm0 = xmm0[2,1],xmm1[1,0]
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 %x4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 -1)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float>, <8 x float>, i32, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vshufps $22, %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    ## ymm2 = ymm2[2,1],k1[1,0],ymm2[6,5],k1[5,4]
; CHECK-NEXT:    vshufps $22, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4]
; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

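; valignd/valignq: concatenate the two sources and extract a window shifted by the immediate element count.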
declare <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    valignd $22, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    valignd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT:    valignd $22, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1)
  %res2 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer, i8 %x4)
  %res3 = add <4 x i32> %res, %res1
  %res4 = add <4 x i32> %res3, %res2
  ret <4 x i32> %res4
}

declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    valignd $22, %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    valignd $22, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)

define <2 x i64>@test_int_x86_avx512_mask_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    valignq $22, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    valignq $22, %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)

define <4 x i64>@test_int_x86_avx512_mask_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    valignq $22, %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    valignq $22, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

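; In-lane permutes: immediate forms (vpermilpd/vpermilps $imm) followed by variable forms driven by a vector control.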
declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilpd $22, %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    ## ymm1 = ymm1[0,1,3,2]
; CHECK-NEXT:    vpermilpd $22, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    ## ymm2 = k1[0,1,3,2]
; CHECK-NEXT:    vpermilpd $22, %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[0,1,3,2]
; CHECK-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3)
  %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1)
  %res3 = fadd <4 x double> %res, %res1
  %res4 = fadd <4 x double> %res2, %res3
  ret <4 x double> %res4
}

declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilpd $1, %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    ## xmm1 = xmm1[1,0]
; CHECK-NEXT:    vpermilpd $1, %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    ## xmm2 = k1[1,0]
; CHECK-NEXT:    vpermilpd $1, %xmm0, %xmm0
; CHECK-NEXT:    ## xmm0 = xmm0[1,0]
; CHECK-NEXT:    vaddpd %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1)
  %res3 = fadd <2 x double> %res, %res1
  %res4 = fadd <2 x double> %res3, %res2
  ret <2 x double> %res4
}

declare <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float>, i32, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilps $22, %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    ## ymm1 = ymm1[2,1,1,0,6,5,5,4]
; CHECK-NEXT:    vpermilps $22, %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    ## ymm2 = k1[2,1,1,0,6,5,5,4]
; CHECK-NEXT:    vpermilps $22, %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[2,1,1,0,6,5,5,4]
; CHECK-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
  %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1)
  %res3 = fadd <8 x float> %res, %res1
  %res4 = fadd <8 x float> %res3, %res2
  ret <8 x float> %res4
}

declare <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float>, i32, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilps $22, %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    ## xmm1 = xmm1[2,1,1,0]
; CHECK-NEXT:    vpermilps $22, %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    ## xmm2 = k1[2,1,1,0]
; CHECK-NEXT:    vpermilps $22, %xmm0, %xmm0
; CHECK-NEXT:    ## xmm0 = xmm0[2,1,1,0]
; CHECK-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1)
  %res3 = fadd <4 x float> %res, %res1
  %res4 = fadd <4 x float> %res2, %res3
  ret <4 x float> %res4
}

declare <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm3 {%k1} {z}
; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm1
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3)
  %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
  %res3 = fadd <4 x double> %res, %res1
  %res4 = fadd <4 x double> %res2, %res3
  ret <4 x double> %res4
}

declare <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddpd %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> zeroinitializer, i8 %x3)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
  %res3 = fadd <2 x double> %res, %res1
  %res4 = fadd <2 x double> %res3, %res2
  ret <2 x double> %res4
}

declare <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm3 {%k1} {z}
; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vaddps %ymm3, %ymm2, %ymm1
; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3)
  %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
  %res3 = fadd <8 x float> %res, %res1
  %res4 = fadd <8 x float> %res3, %res2
  ret <8 x float> %res4
}

declare <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm3 {%k1} {z}
; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vaddps %xmm3, %xmm2, %xmm1
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> zeroinitializer, i8 %x3)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
  %res3 = fadd <4 x float> %res, %res1
  %res4 = fadd <4 x float> %res2, %res3
  ret <4 x float> %res4
}

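; 128-bit lane insertion (vinsertf32x4/vinserti32x4) with merge, zero, and no masking.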
declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
  %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4)
  %res3 = fadd <8 x float> %res, %res1
  %res4 = fadd <8 x float> %res2, %res3
  ret <8 x float> %res4
}

declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8)

define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1)
  %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4)
  %res3 = add <8 x i32> %res, %res1
  %res4 = add <8 x i32> %res2, %res3
  ret <8 x i32> %res4
}

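; Bitwise ternary logic (vpternlogd/vpternlogq); the imm8 ($33) encodes the three-input truth table. The mask.* intrinsics merge into the destination, the maskz.* intrinsics zero inactive elements.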
declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8)

define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4)
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8)

define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4)
  %res1 = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32, i8)

define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4)
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32, i8)

define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4)
  %res1 = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32, i8)

define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1}
; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32, i8)

define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32, i8)

define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1}
; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32, i8)

define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm3
; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4)
  %res1 = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

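; Broadcasts (vpbroadcastd/vpbroadcastq): unmasked, merge-masked, and zero-masked forms, summed together.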
5241declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8)
5242
define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 -1)
  %res1 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask)
  %res2 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask)
  %res3 = add <8 x i32> %res, %res1
  %res4 = add <8 x i32> %res2, %res3
  ret <8 x i32> %res4
}

declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8)

define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpbroadcastd %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpbroadcastd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpbroadcastd %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
  %res1 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask)
  %res2 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask)
  %res3 = add <4 x i32> %res, %res1
  %res4 = add <4 x i32> %res2, %res3
  ret <4 x i32> %res4
}

declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8)

define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm1 {%k1}
; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1, i8 -1)
  %res1 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask)
  %res2 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask)
  %res3 = add <4 x i64> %res, %res1
  %res4 = add <4 x i64> %res2, %res3
  ret <4 x i64> %res4
}

declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8)

define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
; CHECK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
  %res1 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask)
  %res2 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer, i8 %mask)
  %res3 = add <2 x i64> %res, %res1
  %res4 = add <2 x i64> %res2, %res3
  ret <2 x i64> %res4
}

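; vcvtph2ps widens packed half-precision values to single precision: the
; 128-bit form converts the low four i16 elements of %a0, the 256-bit form
; all eight. The _rrk/_rrkz variants cover the merge-masking and zero-masking
; register forms.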
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
  ; CHECK: test_x86_vcvtph2ps_128
  ; CHECK: vcvtph2ps  %xmm0, %xmm0
  %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}

define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0, <4 x float> %a1, i8 %mask) {
  ; CHECK: test_x86_vcvtph2ps_128_rrk
  ; CHECK: vcvtph2ps  %xmm0, %xmm1 {%k1}
  %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> %a1, i8 %mask)
  ret <4 x float> %res
}

define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) {
  ; CHECK: test_x86_vcvtph2ps_128_rrkz
  ; CHECK: vcvtph2ps  %xmm0, %xmm0 {%k1} {z}
  %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 %mask)
  ret <4 x float> %res
}

declare <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16>, <4 x float>, i8) nounwind readonly

define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
  ; CHECK: test_x86_vcvtph2ps_256
  ; CHECK: vcvtph2ps  %xmm0, %ymm0
  %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1)
  ret <8 x float> %res
}

define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0, <8 x float> %a1, i8 %mask) {
  ; CHECK: test_x86_vcvtph2ps_256_rrk
  ; CHECK: vcvtph2ps  %xmm0, %ymm1 {%k1}
  %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask)
  ret <8 x float> %res
}

define <8 x float> @test_x86_vcvtph2ps_256_rrkz(<8 x i16> %a0, i8 %mask) {
  ; CHECK: test_x86_vcvtph2ps_256_rrkz
  ; CHECK: vcvtph2ps  %xmm0, %ymm0 {%k1} {z}
  %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 %mask)
  ret <8 x float> %res
}

declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, i8) nounwind readonly

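; vcvtps2ph is the narrowing direction; its i32 immediate (here 2) is passed
; through as the instruction's rounding-control field. The 128-bit form
; writes four converted halves, the 256-bit form fills all eight i16 lanes of
; the xmm result.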
define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) {
  ; CHECK: test_x86_vcvtps2ph_128
  ; CHECK: vcvtps2ph $2, %xmm0, %xmm0
  %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
  ret <8 x i16> %res
}

declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16>, i8) nounwind readonly

define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
  ; CHECK: test_x86_vcvtps2ph_256
  ; CHECK: vcvtps2ph $2, %ymm0, %xmm0
  %res = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
  ret <8 x i16> %res
}

declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16>, i8) nounwind readonly

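; The *dup tests below cover the SSE3-style duplicate shuffles under AVX-512
; masking: movsldup repeats the even-indexed single-precision lanes, movshdup
; the odd-indexed ones, and movddup the even-indexed doubles, matching the
; shuffle comments in the CHECK lines (e.g. xmm0[0,0,2,2]).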
declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovsldup %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    ## xmm1 = xmm0[0,0,2,2]
; CHECK-NEXT:    vmovsldup %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    ## xmm2 = xmm0[0,0,2,2]
; CHECK-NEXT:    vmovsldup %xmm0, %xmm0
; CHECK-NEXT:    ## xmm0 = xmm0[0,0,2,2]
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
  %res3 = fadd <4 x float> %res, %res1
  %res4 = fadd <4 x float> %res2, %res3
  ret <4 x float> %res4
}

declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovsldup %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    ## ymm1 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovsldup %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovsldup %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
  %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
  %res3 = fadd <8 x float> %res, %res1
  %res4 = fadd <8 x float> %res2, %res3
  ret <8 x float> %res4
}

declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>, i8)

define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovshdup %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    ## xmm1 = xmm0[1,1,3,3]
; CHECK-NEXT:    vmovshdup %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    ## xmm2 = xmm0[1,1,3,3]
; CHECK-NEXT:    vmovshdup %xmm0, %xmm0
; CHECK-NEXT:    ## xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
  %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
  %res3 = fadd <4 x float> %res, %res1
  %res4 = fadd <4 x float> %res2, %res3
  ret <4 x float> %res4
}

declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>, i8)

define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovshdup %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    ## ymm1 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vmovshdup %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vmovshdup %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
  %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
  %res3 = fadd <8 x float> %res, %res1
  %res4 = fadd <8 x float> %res2, %res3
  ret <8 x float> %res4
}

declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8)

define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovddup %xmm0, %xmm1 {%k1}
; CHECK-NEXT:    ## xmm1 = xmm0[0,0]
; CHECK-NEXT:    vmovddup %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT:    ## xmm2 = xmm0[0,0]
; CHECK-NEXT:    vmovddup %xmm0, %xmm0
; CHECK-NEXT:    ## xmm0 = xmm0[0,0]
; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1)
  %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2)
  %res3 = fadd <2 x double> %res, %res1
  %res4 = fadd <2 x double> %res2, %res3
  ret <2 x double> %res4
}

declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8)

define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovddup %ymm0, %ymm1 {%k1}
; CHECK-NEXT:    ## ymm1 = ymm0[0,0,2,2]
; CHECK-NEXT:    vmovddup %ymm0, %ymm2 {%k1} {z}
; CHECK-NEXT:    ## ymm2 = ymm0[0,0,2,2]
; CHECK-NEXT:    vmovddup %ymm0, %ymm0
; CHECK-NEXT:    ## ymm0 = ymm0[0,0,2,2]
; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1)
  %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2)
  %res3 = fadd <4 x double> %res, %res1
  %res4 = fadd <4 x double> %res2, %res3
  ret <4 x double> %res4
}
