• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
2
3; 256-bit
4
5define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
6; CHECK-LABEL: test_pcmpeq_b_256
7; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
8  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
9  ret i32 %res
10}
11
12define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
13; CHECK-LABEL: test_mask_pcmpeq_b_256
14; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
15  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
16  ret i32 %res
17}
18
19declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
20
21define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
22; CHECK-LABEL: test_pcmpeq_w_256
23; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
24  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
25  ret i16 %res
26}
27
28define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
29; CHECK-LABEL: test_mask_pcmpeq_w_256
30; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
31  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
32  ret i16 %res
33}
34
35declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
36
37define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
38; CHECK-LABEL: test_pcmpgt_b_256
39; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
40  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
41  ret i32 %res
42}
43
44define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
45; CHECK-LABEL: test_mask_pcmpgt_b_256
46; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
47  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
48  ret i32 %res
49}
50
51declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
52
53define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
54; CHECK-LABEL: test_pcmpgt_w_256
55; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
56  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
57  ret i16 %res
58}
59
60define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
61; CHECK-LABEL: test_mask_pcmpgt_w_256
62; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
63  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
64  ret i16 %res
65}
66
67declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
68
69define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
70; CHECK_LABEL: test_cmp_b_256
71; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
72  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
73  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
74; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
75  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
76  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
77; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
78  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
79  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
80; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
81  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
82  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
83; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
84  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
85  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
86; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
87  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
88  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
89; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
90  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
91  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
92; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
93  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
94  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
95  ret <8 x i32> %vec7
96}
97
98define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
99; CHECK_LABEL: test_mask_cmp_b_256
100; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
101  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
102  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
103; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
104  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
105  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
106; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
107  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
108  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
109; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
110  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
111  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
112; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
113  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
114  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
115; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
116  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
117  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
118; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
119  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
120  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
121; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
122  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
123  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
124  ret <8 x i32> %vec7
125}
126
127declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
128
129define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
130; CHECK_LABEL: test_ucmp_b_256
131; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
132  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
133  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
134; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
135  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
136  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
137; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
138  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
139  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
140; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
141  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
142  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
143; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
144  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
145  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
146; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
147  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
148  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
149; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
150  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
151  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
152; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
153  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
154  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
155  ret <8 x i32> %vec7
156}
157
158define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
159; CHECK_LABEL: test_mask_ucmp_b_256
160; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
161  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
162  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
163; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
164  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
165  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
166; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
167  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
168  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
169; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
170  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
171  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
172; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
173  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
174  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
175; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
176  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
177  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
178; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
179  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
180  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
181; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
182  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
183  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
184  ret <8 x i32> %vec7
185}
186
187declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
188
189define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
190; CHECK_LABEL: test_cmp_w_256
191; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
192  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
193  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
194; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
195  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
196  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
197; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
198  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
199  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
200; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
201  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
202  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
203; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
204  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
205  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
206; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
207  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
208  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
209; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
210  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
211  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
212; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
213  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
214  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
215  ret <8 x i16> %vec7
216}
217
218define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
219; CHECK_LABEL: test_mask_cmp_w_256
220; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
221  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
222  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
223; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
224  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
225  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
226; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
227  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
228  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
229; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
230  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
231  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
232; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
233  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
234  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
235; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
236  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
237  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
238; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
239  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
240  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
241; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
242  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
243  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
244  ret <8 x i16> %vec7
245}
246
247declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
248
249define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
250; CHECK_LABEL: test_ucmp_w_256
251; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
252  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
253  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
254; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
255  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
256  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
257; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
258  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
259  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
260; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
261  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
262  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
263; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
264  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
265  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
266; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
267  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
268  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
269; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
270  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
271  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
272; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
273  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
274  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
275  ret <8 x i16> %vec7
276}
277
278define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
279; CHECK_LABEL: test_mask_ucmp_w_256
280; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
281  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
282  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
283; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
284  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
285  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
286; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
287  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
288  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
289; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
290  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
291  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
292; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
293  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
294  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
295; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
296  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
297  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
298; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
299  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
300  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
301; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
302  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
303  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
304  ret <8 x i16> %vec7
305}
306
307declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
308
309; 128-bit
310
311define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
312; CHECK-LABEL: test_pcmpeq_b_128
313; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
314  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
315  ret i16 %res
316}
317
318define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
319; CHECK-LABEL: test_mask_pcmpeq_b_128
320; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
321  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
322  ret i16 %res
323}
324
325declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
326
327define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
328; CHECK-LABEL: test_pcmpeq_w_128
329; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
330  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
331  ret i8 %res
332}
333
334define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
335; CHECK-LABEL: test_mask_pcmpeq_w_128
336; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
337  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
338  ret i8 %res
339}
340
341declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)
342
343define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
344; CHECK-LABEL: test_pcmpgt_b_128
345; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
346  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
347  ret i16 %res
348}
349
350define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
351; CHECK-LABEL: test_mask_pcmpgt_b_128
352; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
353  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
354  ret i16 %res
355}
356
357declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)
358
359define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
360; CHECK-LABEL: test_pcmpgt_w_128
361; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
362  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
363  ret i8 %res
364}
365
366define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
367; CHECK-LABEL: test_mask_pcmpgt_w_128
368; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
369  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
370  ret i8 %res
371}
372
373declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
374
375define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
376; CHECK_LABEL: test_cmp_b_128
377; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
378  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
379  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
380; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
381  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
382  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
383; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
384  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
385  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
386; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
387  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
388  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
389; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
390  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
391  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
392; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
393  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
394  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
395; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
396  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
397  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
398; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
399  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
400  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
401  ret <8 x i16> %vec7
402}
403
404define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
405; CHECK_LABEL: test_mask_cmp_b_128
406; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
407  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
408  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
409; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
410  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
411  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
412; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
413  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
414  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
415; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
416  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
417  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
418; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
419  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
420  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
421; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
422  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
423  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
424; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
425  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
426  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
427; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
428  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
429  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
430  ret <8 x i16> %vec7
431}
432
433declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
434
435define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
436; CHECK_LABEL: test_ucmp_b_128
437; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
438  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
439  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
440; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
441  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
442  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
443; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
444  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
445  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
446; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
447  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
448  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
449; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
450  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
451  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
452; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
453  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
454  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
455; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
456  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
457  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
458; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
459  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
460  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
461  ret <8 x i16> %vec7
462}
463
464define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
465; CHECK_LABEL: test_mask_ucmp_b_128
466; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
467  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
468  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
469; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
470  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
471  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
472; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
473  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
474  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
475; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
476  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
477  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
478; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
479  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
480  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
481; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
482  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
483  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
484; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
485  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
486  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
487; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
488  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
489  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
490  ret <8 x i16> %vec7
491}
492
493declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
494
495define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
496; CHECK_LABEL: test_cmp_w_128
497; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
498  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
499  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
500; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
501  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
502  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
503; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
504  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
505  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
506; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
507  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
508  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
509; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
510  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
511  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
512; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
513  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
514  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
515; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
516  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
517  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
518; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
519  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
520  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
521  ret <8 x i8> %vec7
522}
523
524define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
525; CHECK_LABEL: test_mask_cmp_w_128
526; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
527  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
528  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
529; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
530  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
531  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
532; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
533  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
534  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
535; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
536  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
537  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
538; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
539  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
540  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
541; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
542  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
543  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
544; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
545  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
546  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
547; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
548  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
549  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
550  ret <8 x i8> %vec7
551}
552
553declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
554
555define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
556; CHECK_LABEL: test_ucmp_w_128
557; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
558  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
559  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
560; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
561  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
562  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
563; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
564  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
565  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
566; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
567  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
568  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
569; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
570  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
571  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
572; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
573  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
574  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
575; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
576  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
577  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
578; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
579  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
580  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
581  ret <8 x i8> %vec7
582}
583
584define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
585; CHECK_LABEL: test_mask_ucmp_w_128
586; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
587  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
588  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
589; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
590  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
591  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
592; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
593  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
594  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
595; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
596  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
597  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
598; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
599  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
600  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
601; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
602  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
603  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
604; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
605  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
606  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
607; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
608  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
609  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
610  ret <8 x i8> %vec7
611}
612
613declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
614
615declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
616
617define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
618  ; CHECK-LABEL: test_mask_vfmadd256_ps
619  ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
620  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
621  ret <8 x float> %res
622}
623
624declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
625
626define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
627  ; CHECK-LABEL: test_mask_vfmadd128_ps
628  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
629  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
630  ret <4 x float> %res
631}
632
633declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
634
635define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
636; CHECK-LABEL: test_mask_fmadd256_pd:
637; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
638  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
639  ret <4 x double> %res
640}
641
642declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
643
644define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
645; CHECK-LABEL: test_mask_fmadd128_pd:
646; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
647  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
648  ret <2 x double> %res
649}
650
651define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
652; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
653; CHECK:       ## BB#0:
654; CHECK-NEXT:    movzbl %dil, %eax
655; CHECK-NEXT:    kmovw %eax, %k1
656; CHECK-NEXT:    vmovaps %zmm0, %zmm3
657; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
658; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
659; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
660; CHECK-NEXT:    retq
661  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
662  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
663  %res2 = fadd <2 x double> %res, %res1
664  ret <2 x double> %res2
665}
666
667declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
668
669define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
670; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
671; CHECK:       ## BB#0:
672; CHECK-NEXT:    movzbl %dil, %eax
673; CHECK-NEXT:    kmovw %eax, %k1
674; CHECK-NEXT:    vmovaps %zmm2, %zmm3
675; CHECK-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1}
676; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
677; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
678; CHECK-NEXT:    retq
679  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
680  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
681  %res2 = fadd <2 x double> %res, %res1
682  ret <2 x double> %res2
683}
684
685declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
686
687define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
688; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
689; CHECK:       ## BB#0:
690; CHECK-NEXT:    movzbl %dil, %eax
691; CHECK-NEXT:    kmovw %eax, %k1
692; CHECK-NEXT:    vmovaps %zmm0, %zmm3
693; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
694; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
695; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
696; CHECK-NEXT:    retq
697  %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
698  %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
699  %res2 = fadd <2 x double> %res, %res1
700  ret <2 x double> %res2
701}
702
703define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
704; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
705; CHECK:       ## BB#0:
706; CHECK-NEXT:    movzbl %dil, %eax
707; CHECK-NEXT:    kmovw %eax, %k1
708; CHECK-NEXT:    vmovaps %zmm0, %zmm3
709; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
710; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
711; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
712; CHECK-NEXT:    retq
713  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
714  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
715  %res2 = fadd <4 x double> %res, %res1
716  ret <4 x double> %res2
717}
718
719declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
720
721define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
722; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
723; CHECK:       ## BB#0:
724; CHECK-NEXT:    movzbl %dil, %eax
725; CHECK-NEXT:    kmovw %eax, %k1
726; CHECK-NEXT:    vmovaps %zmm2, %zmm3
727; CHECK-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1}
728; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
729; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
730; CHECK-NEXT:    retq
731  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
732  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
733  %res2 = fadd <4 x double> %res, %res1
734  ret <4 x double> %res2
735}
736
737declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
738
739define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
740; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
741; CHECK:       ## BB#0:
742; CHECK-NEXT:    movzbl %dil, %eax
743; CHECK-NEXT:    kmovw %eax, %k1
744; CHECK-NEXT:    vmovaps %zmm0, %zmm3
745; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
746; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
747; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
748; CHECK-NEXT:    retq
749  %res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
750  %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
751  %res2 = fadd <4 x double> %res, %res1
752  ret <4 x double> %res2
753}
754
755define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
756; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
757; CHECK:       ## BB#0:
758; CHECK-NEXT:    movzbl %dil, %eax
759; CHECK-NEXT:    kmovw %eax, %k1
760; CHECK-NEXT:    vmovaps %zmm0, %zmm3
761; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
762; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
763; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
764; CHECK-NEXT:    retq
765  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
766  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
767  %res2 = fadd <4 x float> %res, %res1
768  ret <4 x float> %res2
769}
770
771declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
772
773define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
774; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
775; CHECK:       ## BB#0:
776; CHECK-NEXT:    movzbl %dil, %eax
777; CHECK-NEXT:    kmovw %eax, %k1
778; CHECK-NEXT:    vmovaps %zmm2, %zmm3
779; CHECK-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1}
780; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
781; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
782; CHECK-NEXT:    retq
783  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
784  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
785  %res2 = fadd <4 x float> %res, %res1
786  ret <4 x float> %res2
787}
788
789declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
790
791define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
792; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
793; CHECK:       ## BB#0:
794; CHECK-NEXT:    movzbl %dil, %eax
795; CHECK-NEXT:    kmovw %eax, %k1
796; CHECK-NEXT:    vmovaps %zmm0, %zmm3
797; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
798; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
799; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
800; CHECK-NEXT:    retq
801  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
802  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
803  %res2 = fadd <4 x float> %res, %res1
804  ret <4 x float> %res2
805}
806
807define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
808; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
809; CHECK:       ## BB#0:
810; CHECK-NEXT:    movzbl %dil, %eax
811; CHECK-NEXT:    kmovw %eax, %k1
812; CHECK-NEXT:    vmovaps %zmm0, %zmm3
813; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
814; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
815; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
816; CHECK-NEXT:    retq
817  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
818  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
819  %res2 = fadd <8 x float> %res, %res1
820  ret <8 x float> %res2
821}
822
823declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
824
825define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
826; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
827; CHECK:       ## BB#0:
828; CHECK-NEXT:    movzbl %dil, %eax
829; CHECK-NEXT:    kmovw %eax, %k1
830; CHECK-NEXT:    vmovaps %zmm2, %zmm3
831; CHECK-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1}
832; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
833; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
834; CHECK-NEXT:    retq
835  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
836  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
837  %res2 = fadd <8 x float> %res, %res1
838  ret <8 x float> %res2
839}
840
841declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
842
843define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
844; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
845; CHECK:       ## BB#0:
846; CHECK-NEXT:    movzbl %dil, %eax
847; CHECK-NEXT:    kmovw %eax, %k1
848; CHECK-NEXT:    vmovaps %zmm0, %zmm3
849; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
850; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
851; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
852; CHECK-NEXT:    retq
853  %res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
854  %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
855  %res2 = fadd <8 x float> %res, %res1
856  ret <8 x float> %res2
857}
858
859
860declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
861
862define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
863; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
864; CHECK:       ## BB#0:
865; CHECK-NEXT:    movzbl %dil, %eax
866; CHECK-NEXT:    kmovw %eax, %k1
867; CHECK-NEXT:    vmovaps %zmm2, %zmm3
868; CHECK-NEXT:    vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
869; CHECK-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0
870; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
871; CHECK-NEXT:    retq
872  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
873  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
874  %res2 = fadd <2 x double> %res, %res1
875  ret <2 x double> %res2
876}
877
878
879declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
880
881define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
882; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
883; CHECK:       ## BB#0:
884; CHECK-NEXT:    movzbl %dil, %eax
885; CHECK-NEXT:    kmovw %eax, %k1
886; CHECK-NEXT:    vmovaps %zmm2, %zmm3
887; CHECK-NEXT:    vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
888; CHECK-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0
889; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
890; CHECK-NEXT:    retq
891  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
892  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
893  %res2 = fadd <4 x double> %res, %res1
894  ret <4 x double> %res2
895}
896
897declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
898
899define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
900; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
901; CHECK:       ## BB#0:
902; CHECK-NEXT:    movzbl %dil, %eax
903; CHECK-NEXT:    kmovw %eax, %k1
904; CHECK-NEXT:    vmovaps %zmm2, %zmm3
905; CHECK-NEXT:    vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
906; CHECK-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
907; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
908; CHECK-NEXT:    retq
909  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
910  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
911  %res2 = fadd <4 x float> %res, %res1
912  ret <4 x float> %res2
913}
914
915declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
916
917define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
918; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
919; CHECK:       ## BB#0:
920; CHECK-NEXT:    movzbl %dil, %eax
921; CHECK-NEXT:    kmovw %eax, %k1
922; CHECK-NEXT:    vmovaps %zmm2, %zmm3
923; CHECK-NEXT:    vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
924; CHECK-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
925; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
926; CHECK-NEXT:    retq
927  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
928  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
929  %res2 = fadd <8 x float> %res, %res1
930  ret <8 x float> %res2
931}
932
933declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
934
935define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
936  ; CHECK-LABEL: test_mask_vfnmadd256_ps
937  ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
938  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
939  ret <8 x float> %res
940}
941
942declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
943
944define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
945  ; CHECK-LABEL: test_mask_vfnmadd128_ps
946  ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
947  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
948  ret <4 x float> %res
949}
950
951declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
952
953define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
954  ; CHECK-LABEL: test_mask_vfnmadd256_pd
955  ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
956  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
957  ret <4 x double> %res
958}
959
960declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
961
962define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
963  ; CHECK-LABEL: test_mask_vfnmadd128_pd
964  ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
965  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
966  ret <2 x double> %res
967}
968
969declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
970
971define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
972  ; CHECK-LABEL: test_mask_vfnmsub256_ps
973  ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
974  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
975  ret <8 x float> %res
976}
977
978declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
979
980define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
981  ; CHECK-LABEL: test_mask_vfnmsub128_ps
982  ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
983  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
984  ret <4 x float> %res
985}
986
987declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
988
989define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
990  ; CHECK-LABEL: test_mask_vfnmsub256_pd
991  ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
992  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
993  ret <4 x double> %res
994}
995
996declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
997
998define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
999  ; CHECK-LABEL: test_mask_vfnmsub128_pd
1000  ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
1001  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
1002  ret <2 x double> %res
1003}
1004
1005
1006define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
1007; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
1008; CHECK:       ## BB#0:
1009; CHECK-NEXT:    movzbl %dil, %eax
1010; CHECK-NEXT:    kmovw %eax, %k1
1011; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1012; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1}
1013; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
1014; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
1015; CHECK-NEXT:    retq
1016  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
1017  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
1018  %res2 = fadd <2 x double> %res, %res1
1019  ret <2 x double> %res2
1020}
1021
1022declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
1023
1024define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
1025; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
1026; CHECK:       ## BB#0:
1027; CHECK-NEXT:    movzbl %dil, %eax
1028; CHECK-NEXT:    kmovw %eax, %k1
1029; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1030; CHECK-NEXT:    vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
1031; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
1032; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
1033; CHECK-NEXT:    retq
1034  %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
1035  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
1036  %res2 = fadd <2 x double> %res, %res1
1037  ret <2 x double> %res2
1038}
1039
1040define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
1041; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
1042; CHECK:       ## BB#0:
1043; CHECK-NEXT:    movzbl %dil, %eax
1044; CHECK-NEXT:    kmovw %eax, %k1
1045; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1046; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1}
1047; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
1048; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
1049; CHECK-NEXT:    retq
1050  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
1051  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
1052  %res2 = fadd <4 x double> %res, %res1
1053  ret <4 x double> %res2
1054}
1055
1056declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
1057
1058define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
1059; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
1060; CHECK:       ## BB#0:
1061; CHECK-NEXT:    movzbl %dil, %eax
1062; CHECK-NEXT:    kmovw %eax, %k1
1063; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1064; CHECK-NEXT:    vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
1065; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
1066; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
1067; CHECK-NEXT:    retq
1068  %res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
1069  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
1070  %res2 = fadd <4 x double> %res, %res1
1071  ret <4 x double> %res2
1072}
1073
1074define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
1075; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
1076; CHECK:       ## BB#0:
1077; CHECK-NEXT:    movzbl %dil, %eax
1078; CHECK-NEXT:    kmovw %eax, %k1
1079; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1080; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1}
1081; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
1082; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
1083; CHECK-NEXT:    retq
1084  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
1085  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
1086  %res2 = fadd <4 x float> %res, %res1
1087  ret <4 x float> %res2
1088}
1089
1090declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
1091
1092define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
1093; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
1094; CHECK:       ## BB#0:
1095; CHECK-NEXT:    movzbl %dil, %eax
1096; CHECK-NEXT:    kmovw %eax, %k1
1097; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1098; CHECK-NEXT:    vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
1099; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
1100; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
1101; CHECK-NEXT:    retq
1102  %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
1103  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
1104  %res2 = fadd <4 x float> %res, %res1
1105  ret <4 x float> %res2
1106}
1107
1108define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
1109; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
1110; CHECK:       ## BB#0:
1111; CHECK-NEXT:    movzbl %dil, %eax
1112; CHECK-NEXT:    kmovw %eax, %k1
1113; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1114; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1}
1115; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
1116; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1117; CHECK-NEXT:    retq
1118  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
1119  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
1120  %res2 = fadd <8 x float> %res, %res1
1121  ret <8 x float> %res2
1122}
1123
1124declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
1125
1126define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
1127; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
1128; CHECK:       ## BB#0:
1129; CHECK-NEXT:    movzbl %dil, %eax
1130; CHECK-NEXT:    kmovw %eax, %k1
1131; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1132; CHECK-NEXT:    vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
1133; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
1134; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1135; CHECK-NEXT:    retq
1136  %res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
1137  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
1138  %res2 = fadd <8 x float> %res, %res1
1139  ret <8 x float> %res2
1140}
1141
1142define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
1143; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
1144; CHECK:       ## BB#0:
1145; CHECK-NEXT:    movzbl %dil, %eax
1146; CHECK-NEXT:    kmovw %eax, %k1
1147; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1148; CHECK-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
1149; CHECK-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0
1150; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
1151; CHECK-NEXT:    retq
1152  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
1153  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
1154  %res2 = fadd <2 x double> %res, %res1
1155  ret <2 x double> %res2
1156}
1157
1158define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
1159; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
1160; CHECK:       ## BB#0:
1161; CHECK-NEXT:    movzbl %dil, %eax
1162; CHECK-NEXT:    kmovw %eax, %k1
1163; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1164; CHECK-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
1165; CHECK-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
1166; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
1167; CHECK-NEXT:    retq
1168  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
1169  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
1170  %res2 = fadd <4 x double> %res, %res1
1171  ret <4 x double> %res2
1172}
1173
1174define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
1175; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
1176; CHECK:       ## BB#0:
1177; CHECK-NEXT:    movzbl %dil, %eax
1178; CHECK-NEXT:    kmovw %eax, %k1
1179; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1180; CHECK-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
1181; CHECK-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
1182; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
1183; CHECK-NEXT:    retq
1184  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
1185  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
1186  %res2 = fadd <4 x float> %res, %res1
1187  ret <4 x float> %res2
1188}
1189
1190define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
1191; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
1192; CHECK:       ## BB#0:
1193; CHECK-NEXT:    movzbl %dil, %eax
1194; CHECK-NEXT:    kmovw %eax, %k1
1195; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1196; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
1197; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
1198; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1199; CHECK-NEXT:    retq
1200  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
1201  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
1202  %res2 = fadd <8 x float> %res, %res1
1203  ret <8 x float> %res2
1204}
1205
1206declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
1207
1208define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
1209; CHECK-LABEL: test_mask_fmaddsub256_ps:
1210; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
1211  %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
1212  ret <8 x float> %res
1213}
1214
1215declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
1216
1217define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1218; CHECK-LABEL: test_mask_fmaddsub128_ps:
1219; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
1220  %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
1221  ret <4 x float> %res
1222}
1223
1224declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
1225
1226define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
1227  ; CHECK-LABEL: test_mask_vfmaddsub256_pd
1228  ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
1229  %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
1230  ret <4 x double> %res
1231}
1232
1233declare <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
1234
1235define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
1236  ; CHECK-LABEL: test_mask_vfmaddsub128_pd
1237  ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
1238  %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
1239  ret <2 x double> %res
1240}
1241
1242define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
1243; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
1244; CHECK:       ## BB#0:
1245; CHECK-NEXT:    movzbl %dil, %eax
1246; CHECK-NEXT:    kmovw %eax, %k1
1247; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1248; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1}
1249; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
1250; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
1251; CHECK-NEXT:    retq
1252  %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
1253  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
1254  %res2 = fadd <2 x double> %res, %res1
1255  ret <2 x double> %res2
1256}
1257
1258declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
1259
1260define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
1261; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
1262; CHECK:       ## BB#0:
1263; CHECK-NEXT:    movzbl %dil, %eax
1264; CHECK-NEXT:    kmovw %eax, %k1
1265; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1266; CHECK-NEXT:    vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1}
1267; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
1268; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
1269; CHECK-NEXT:    retq
1270  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
1271  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
1272  %res2 = fadd <2 x double> %res, %res1
1273  ret <2 x double> %res2
1274}
1275
1276declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
1277
1278define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
1279; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
1280; CHECK:       ## BB#0:
1281; CHECK-NEXT:    movzbl %dil, %eax
1282; CHECK-NEXT:    kmovw %eax, %k1
1283; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1284; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
1285; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
1286; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
1287; CHECK-NEXT:    retq
1288  %res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
1289  %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
1290  %res2 = fadd <2 x double> %res, %res1
1291  ret <2 x double> %res2
1292}
1293
1294define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
1295; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
1296; CHECK:       ## BB#0:
1297; CHECK-NEXT:    movzbl %dil, %eax
1298; CHECK-NEXT:    kmovw %eax, %k1
1299; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1300; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1}
1301; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
1302; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
1303; CHECK-NEXT:    retq
1304  %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
1305  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
1306  %res2 = fadd <4 x double> %res, %res1
1307  ret <4 x double> %res2
1308}
1309
1310declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
1311
1312define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
1313; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
1314; CHECK:       ## BB#0:
1315; CHECK-NEXT:    movzbl %dil, %eax
1316; CHECK-NEXT:    kmovw %eax, %k1
1317; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1318; CHECK-NEXT:    vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1}
1319; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
1320; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
1321; CHECK-NEXT:    retq
1322  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
1323  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
1324  %res2 = fadd <4 x double> %res, %res1
1325  ret <4 x double> %res2
1326}
1327
1328declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
1329
1330define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
1331; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
1332; CHECK:       ## BB#0:
1333; CHECK-NEXT:    movzbl %dil, %eax
1334; CHECK-NEXT:    kmovw %eax, %k1
1335; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1336; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
1337; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
1338; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
1339; CHECK-NEXT:    retq
1340  %res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
1341  %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
1342  %res2 = fadd <4 x double> %res, %res1
1343  ret <4 x double> %res2
1344}
1345
1346define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
1347; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
1348; CHECK:       ## BB#0:
1349; CHECK-NEXT:    movzbl %dil, %eax
1350; CHECK-NEXT:    kmovw %eax, %k1
1351; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1352; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1}
1353; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
1354; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
1355; CHECK-NEXT:    retq
1356  %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
1357  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
1358  %res2 = fadd <4 x float> %res, %res1
1359  ret <4 x float> %res2
1360}
1361
1362declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
1363
1364define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
1365; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
1366; CHECK:       ## BB#0:
1367; CHECK-NEXT:    movzbl %dil, %eax
1368; CHECK-NEXT:    kmovw %eax, %k1
1369; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1370; CHECK-NEXT:    vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1}
1371; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
1372; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
1373; CHECK-NEXT:    retq
1374  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
1375  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
1376  %res2 = fadd <4 x float> %res, %res1
1377  ret <4 x float> %res2
1378}
1379
1380declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
1381
1382define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
1383; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
1384; CHECK:       ## BB#0:
1385; CHECK-NEXT:    movzbl %dil, %eax
1386; CHECK-NEXT:    kmovw %eax, %k1
1387; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1388; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
1389; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
1390; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
1391; CHECK-NEXT:    retq
1392  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
1393  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
1394  %res2 = fadd <4 x float> %res, %res1
1395  ret <4 x float> %res2
1396}
1397
1398define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
1399; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
1400; CHECK:       ## BB#0:
1401; CHECK-NEXT:    movzbl %dil, %eax
1402; CHECK-NEXT:    kmovw %eax, %k1
1403; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1404; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1}
1405; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
1406; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1407; CHECK-NEXT:    retq
1408  %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
1409  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
1410  %res2 = fadd <8 x float> %res, %res1
1411  ret <8 x float> %res2
1412}
1413
1414declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
1415
1416define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
1417; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
1418; CHECK:       ## BB#0:
1419; CHECK-NEXT:    movzbl %dil, %eax
1420; CHECK-NEXT:    kmovw %eax, %k1
1421; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1422; CHECK-NEXT:    vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1}
1423; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
1424; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1425; CHECK-NEXT:    retq
1426  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
1427  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
1428  %res2 = fadd <8 x float> %res, %res1
1429  ret <8 x float> %res2
1430}
1431
1432declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
1433
1434define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
1435; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
1436; CHECK:       ## BB#0:
1437; CHECK-NEXT:    movzbl %dil, %eax
1438; CHECK-NEXT:    kmovw %eax, %k1
1439; CHECK-NEXT:    vmovaps %zmm0, %zmm3
1440; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
1441; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
1442; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1443; CHECK-NEXT:    retq
1444  %res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
1445  %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
1446  %res2 = fadd <8 x float> %res, %res1
1447  ret <8 x float> %res2
1448}
1449
1450declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
1451
1452define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
1453; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
1454; CHECK:       ## BB#0:
1455; CHECK-NEXT:    movzbl %dil, %eax
1456; CHECK-NEXT:    kmovw %eax, %k1
1457; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1458; CHECK-NEXT:    vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1}
1459; CHECK-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0
1460; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
1461; CHECK-NEXT:    retq
1462  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
1463  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
1464  %res2=fadd <2 x double> %res, %res1
1465  ret <2 x double> %res2
1466}
1467
1468declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
1469
1470define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
1471; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
1472; CHECK:       ## BB#0:
1473; CHECK-NEXT:    movzbl %dil, %eax
1474; CHECK-NEXT:    kmovw %eax, %k1
1475; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1476; CHECK-NEXT:    vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1}
1477; CHECK-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0
1478; CHECK-NEXT:    vaddpd %ymm0, %ymm3, %ymm0
1479; CHECK-NEXT:    retq
1480  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
1481  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
1482  %res2=fadd <4 x double> %res, %res1
1483  ret <4 x double> %res2
1484}
1485
1486declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
1487
1488define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
1489; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
1490; CHECK:       ## BB#0:
1491; CHECK-NEXT:    movzbl %dil, %eax
1492; CHECK-NEXT:    kmovw %eax, %k1
1493; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1494; CHECK-NEXT:    vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1}
1495; CHECK-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0
1496; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0
1497; CHECK-NEXT:    retq
1498  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
1499  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
1500  %res2=fadd <4 x float> %res, %res1
1501  ret <4 x float> %res2
1502}
1503
1504declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
1505
1506define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
1507; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
1508; CHECK:       ## BB#0:
1509; CHECK-NEXT:    movzbl %dil, %eax
1510; CHECK-NEXT:    kmovw %eax, %k1
1511; CHECK-NEXT:    vmovaps %zmm2, %zmm3
1512; CHECK-NEXT:    vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1}
1513; CHECK-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0
1514; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1515; CHECK-NEXT:    retq
1516  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
1517  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
1518  %res2=fadd <8 x float> %res, %res1
1519  ret <8 x float> %res2
1520}
1521
1522
1523define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
1524  ; CHECK-LABEL: test_mask_vfmadd128_ps_r
1525  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
1526  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
1527  ret <4 x float> %res
1528}
1529
1530define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
1531  ; CHECK-LABEL: test_mask_vfmadd128_ps_rz
1532  ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
1533  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
1534  ret <4 x float> %res
1535}
1536
1537define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
1538  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk
1539  ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
1540  %a2 = load <4 x float>, <4 x float>* %ptr_a2
1541  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
1542  ret <4 x float> %res
1543}
1544
1545define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
1546  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka
1547  ; CHECK: vfmadd213ps     (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
1548  %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
1549  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
1550  ret <4 x float> %res
1551}
1552
1553define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
1554  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz
1555  ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
1556  %a2 = load <4 x float>, <4 x float>* %ptr_a2
1557  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
1558  ret <4 x float> %res
1559}
1560
1561define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
1562  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza
1563  ; CHECK: vfmadd213ps	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
1564  %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4
1565  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
1566  ret <4 x float> %res
1567}
1568
1569define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
1570  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb
1571  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
1572  %q = load float, float* %ptr_a2
1573  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
1574  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
1575  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
1576  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
1577  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
1578  ret <4 x float> %res
1579}
1580
1581define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
1582  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba
1583  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
1584  %q = load float, float* %ptr_a2, align 4
1585  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
1586  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
1587  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
1588  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
1589  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
1590  ret <4 x float> %res
1591}
1592
1593define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
1594  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz
1595  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0  ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
1596  %q = load float, float* %ptr_a2
1597  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
1598  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
1599  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
1600  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
1601  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
1602  ret <4 x float> %res
1603}
1604
1605define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
1606  ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza
1607  ; CHECK: vfmadd213ps	(%rdi){1to4}, %xmm1, %xmm0  ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
1608  %q = load float, float* %ptr_a2, align 4
1609  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
1610  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
1611  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
1612  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
1613  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
1614  ret <4 x float> %res
1615}
1616
1617define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
1618  ; CHECK-LABEL: test_mask_vfmadd128_pd_r
1619  ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
1620  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
1621  ret <2 x double> %res
1622}
1623
1624define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
1625  ; CHECK-LABEL: test_mask_vfmadd128_pd_rz
1626  ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
1627  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
1628  ret <2 x double> %res
1629}
1630
1631define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
1632  ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk
1633  ; CHECK: vfmadd213pd	(%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
1634  %a2 = load <2 x double>, <2 x double>* %ptr_a2
1635  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
1636  ret <2 x double> %res
1637}
1638
1639define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
1640  ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz
1641  ; CHECK: vfmadd213pd	(%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
1642  %a2 = load <2 x double>, <2 x double>* %ptr_a2
1643  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
1644  ret <2 x double> %res
1645}
1646
1647define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
1648  ; CHECK-LABEL: test_mask_vfmadd256_pd_r
1649  ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
1650  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
1651  ret <4 x double> %res
1652}
1653
1654define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
1655  ; CHECK-LABEL: test_mask_vfmadd256_pd_rz
1656  ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
1657  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
1658  ret <4 x double> %res
1659}
1660
1661define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
1662  ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk
1663  ; CHECK: vfmadd213pd	(%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
1664  %a2 = load <4 x double>, <4 x double>* %ptr_a2
1665  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
1666  ret <4 x double> %res
1667}
1668
1669define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
1670  ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz
1671  ; CHECK: vfmadd213pd	(%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
1672  %a2 = load <4 x double>, <4 x double>* %ptr_a2
1673  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
1674  ret <4 x double> %res
1675}
1676define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
1677  ;CHECK-LABEL: test_mask_add_epi16_rr_128
1678  ;CHECK: vpaddw %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0xc1]
1679  %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1680  ret <8 x i16> %res
1681}
1682
1683define <8 x i16> @test_mask_add_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
1684  ;CHECK-LABEL: test_mask_add_epi16_rrk_128
1685  ;CHECK: vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1]
1686  %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1687  ret <8 x i16> %res
1688}
1689
1690define <8 x i16> @test_mask_add_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
1691  ;CHECK-LABEL: test_mask_add_epi16_rrkz_128
1692  ;CHECK: vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1]
1693  %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1694  ret <8 x i16> %res
1695}
1696
1697define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
1698  ;CHECK-LABEL: test_mask_add_epi16_rm_128
1699  ;CHECK: vpaddw (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xfd,0x07]
1700  %b = load <8 x i16>, <8 x i16>* %ptr_b
1701  %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1702  ret <8 x i16> %res
1703}
1704
1705define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
1706  ;CHECK-LABEL: test_mask_add_epi16_rmk_128
1707  ;CHECK: vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f]
1708  %b = load <8 x i16>, <8 x i16>* %ptr_b
1709  %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1710  ret <8 x i16> %res
1711}
1712
1713define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
1714  ;CHECK-LABEL: test_mask_add_epi16_rmkz_128
1715  ;CHECK: vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07]
1716  %b = load <8 x i16>, <8 x i16>* %ptr_b
1717  %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1718  ret <8 x i16> %res
1719}
1720
1721declare <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
1722
1723define <16 x i16> @test_mask_add_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
1724  ;CHECK-LABEL: test_mask_add_epi16_rr_256
1725  ;CHECK: vpaddw %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0xc1]
1726  %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1727  ret <16 x i16> %res
1728}
1729
1730define <16 x i16> @test_mask_add_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
1731  ;CHECK-LABEL: test_mask_add_epi16_rrk_256
1732  ;CHECK: vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1]
1733  %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1734  ret <16 x i16> %res
1735}
1736
1737define <16 x i16> @test_mask_add_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
1738  ;CHECK-LABEL: test_mask_add_epi16_rrkz_256
1739  ;CHECK: vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1]
1740  %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1741  ret <16 x i16> %res
1742}
1743
1744define <16 x i16> @test_mask_add_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
1745  ;CHECK-LABEL: test_mask_add_epi16_rm_256
1746  ;CHECK: vpaddw (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xfd,0x07]
1747  %b = load <16 x i16>, <16 x i16>* %ptr_b
1748  %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1749  ret <16 x i16> %res
1750}
1751
1752define <16 x i16> @test_mask_add_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
1753  ;CHECK-LABEL: test_mask_add_epi16_rmk_256
1754  ;CHECK: vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f]
1755  %b = load <16 x i16>, <16 x i16>* %ptr_b
1756  %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1757  ret <16 x i16> %res
1758}
1759
1760define <16 x i16> @test_mask_add_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
1761  ;CHECK-LABEL: test_mask_add_epi16_rmkz_256
1762  ;CHECK: vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07]
1763  %b = load <16 x i16>, <16 x i16>* %ptr_b
1764  %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1765  ret <16 x i16> %res
1766}
1767
1768declare <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
1769
1770define <8 x i16> @test_mask_sub_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
1771  ;CHECK-LABEL: test_mask_sub_epi16_rr_128
1772  ;CHECK: vpsubw %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0xc1]
1773  %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1774  ret <8 x i16> %res
1775}
1776
1777define <8 x i16> @test_mask_sub_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
1778  ;CHECK-LABEL: test_mask_sub_epi16_rrk_128
1779  ;CHECK: vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1]
1780  %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1781  ret <8 x i16> %res
1782}
1783
1784define <8 x i16> @test_mask_sub_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
1785  ;CHECK-LABEL: test_mask_sub_epi16_rrkz_128
1786  ;CHECK: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1]
1787  %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1788  ret <8 x i16> %res
1789}
1790
1791define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
1792  ;CHECK-LABEL: test_mask_sub_epi16_rm_128
1793  ;CHECK: vpsubw (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xf9,0x07]
1794  %b = load <8 x i16>, <8 x i16>* %ptr_b
1795  %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1796  ret <8 x i16> %res
1797}
1798
1799define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
1800  ;CHECK-LABEL: test_mask_sub_epi16_rmk_128
1801  ;CHECK: vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x0f]
1802  %b = load <8 x i16>, <8 x i16>* %ptr_b
1803  %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1804  ret <8 x i16> %res
1805}
1806
1807define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
1808  ;CHECK-LABEL: test_mask_sub_epi16_rmkz_128
1809  ;CHECK: vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07]
1810  %b = load <8 x i16>, <8 x i16>* %ptr_b
1811  %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1812  ret <8 x i16> %res
1813}
1814
1815declare <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
1816
1817define <16 x i16> @test_mask_sub_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
1818  ;CHECK-LABEL: test_mask_sub_epi16_rr_256
1819  ;CHECK: vpsubw %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0xc1]
1820  %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1821  ret <16 x i16> %res
1822}
1823
1824define <16 x i16> @test_mask_sub_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
1825  ;CHECK-LABEL: test_mask_sub_epi16_rrk_256
1826  ;CHECK: vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1]
1827  %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1828  ret <16 x i16> %res
1829}
1830
1831define <16 x i16> @test_mask_sub_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
1832  ;CHECK-LABEL: test_mask_sub_epi16_rrkz_256
1833  ;CHECK: vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1]
1834  %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1835  ret <16 x i16> %res
1836}
1837
1838define <16 x i16> @test_mask_sub_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
1839  ;CHECK-LABEL: test_mask_sub_epi16_rm_256
1840  ;CHECK: vpsubw (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xf9,0x07]
1841  %b = load <16 x i16>, <16 x i16>* %ptr_b
1842  %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1843  ret <16 x i16> %res
1844}
1845
1846define <16 x i16> @test_mask_sub_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
1847  ;CHECK-LABEL: test_mask_sub_epi16_rmk_256
1848  ;CHECK: vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f]
1849  %b = load <16 x i16>, <16 x i16>* %ptr_b
1850  %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1851  ret <16 x i16> %res
1852}
1853
1854define <16 x i16> @test_mask_sub_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
1855  ;CHECK-LABEL: test_mask_sub_epi16_rmkz_256
1856  ;CHECK: vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07]
1857  %b = load <16 x i16>, <16 x i16>* %ptr_b
1858  %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1859  ret <16 x i16> %res
1860}
1861
1862declare <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
1863
1864define <32 x i16> @test_mask_add_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
1865  ;CHECK-LABEL: test_mask_add_epi16_rr_512
1866  ;CHECK: vpaddw %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
1867  %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
1868  ret <32 x i16> %res
1869}
1870
1871define <32 x i16> @test_mask_add_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
1872  ;CHECK-LABEL: test_mask_add_epi16_rrk_512
1873  ;CHECK: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1]
1874  %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
1875  ret <32 x i16> %res
1876}
1877
1878define <32 x i16> @test_mask_add_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
1879  ;CHECK-LABEL: test_mask_add_epi16_rrkz_512
1880  ;CHECK: vpaddw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0xc1]
1881  %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
1882  ret <32 x i16> %res
1883}
1884
1885define <32 x i16> @test_mask_add_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
1886  ;CHECK-LABEL: test_mask_add_epi16_rm_512
1887  ;CHECK: vpaddw (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0x07]
1888  %b = load <32 x i16>, <32 x i16>* %ptr_b
1889  %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
1890  ret <32 x i16> %res
1891}
1892
1893define <32 x i16> @test_mask_add_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
1894  ;CHECK-LABEL: test_mask_add_epi16_rmk_512
1895  ;CHECK: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f]
1896  %b = load <32 x i16>, <32 x i16>* %ptr_b
1897  %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
1898  ret <32 x i16> %res
1899}
1900
1901define <32 x i16> @test_mask_add_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
1902  ;CHECK-LABEL: test_mask_add_epi16_rmkz_512
1903  ;CHECK: vpaddw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0x07]
1904  %b = load <32 x i16>, <32 x i16>* %ptr_b
1905  %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
1906  ret <32 x i16> %res
1907}
1908
1909declare <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
1910
1911define <32 x i16> @test_mask_sub_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
1912  ;CHECK-LABEL: test_mask_sub_epi16_rr_512
1913  ;CHECK: vpsubw %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0xc1]
1914  %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
1915  ret <32 x i16> %res
1916}
1917
1918define <32 x i16> @test_mask_sub_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
1919  ;CHECK-LABEL: test_mask_sub_epi16_rrk_512
1920  ;CHECK: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1]
1921  %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
1922  ret <32 x i16> %res
1923}
1924
1925define <32 x i16> @test_mask_sub_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
1926  ;CHECK-LABEL: test_mask_sub_epi16_rrkz_512
1927  ;CHECK: vpsubw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0xc1]
1928  %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
1929  ret <32 x i16> %res
1930}
1931
1932define <32 x i16> @test_mask_sub_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
1933  ;CHECK-LABEL: test_mask_sub_epi16_rm_512
1934  ;CHECK: vpsubw (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0x07]
1935  %b = load <32 x i16>, <32 x i16>* %ptr_b
1936  %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
1937  ret <32 x i16> %res
1938}
1939
1940define <32 x i16> @test_mask_sub_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
1941  ;CHECK-LABEL: test_mask_sub_epi16_rmk_512
1942  ;CHECK: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f]
1943  %b = load <32 x i16>, <32 x i16>* %ptr_b
1944  %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
1945  ret <32 x i16> %res
1946}
1947
1948define <32 x i16> @test_mask_sub_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
1949  ;CHECK-LABEL: test_mask_sub_epi16_rmkz_512
1950  ;CHECK: vpsubw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0x07]
1951  %b = load <32 x i16>, <32 x i16>* %ptr_b
1952  %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
1953  ret <32 x i16> %res
1954}
1955
1956declare <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
1957
1958define <32 x i16> @test_mask_mullo_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
1959  ;CHECK-LABEL: test_mask_mullo_epi16_rr_512
1960  ;CHECK: vpmullw %zmm1, %zmm0, %zmm0     ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0xc1]
1961  %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
1962  ret <32 x i16> %res
1963}
1964
1965define <32 x i16> @test_mask_mullo_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
1966  ;CHECK-LABEL: test_mask_mullo_epi16_rrk_512
1967  ;CHECK: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1]
1968  %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
1969  ret <32 x i16> %res
1970}
1971
1972define <32 x i16> @test_mask_mullo_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
1973  ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_512
1974  ;CHECK: vpmullw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0xc1]
1975  %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
1976  ret <32 x i16> %res
1977}
1978
1979define <32 x i16> @test_mask_mullo_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
1980  ;CHECK-LABEL: test_mask_mullo_epi16_rm_512
1981  ;CHECK: vpmullw (%rdi), %zmm0, %zmm0    ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0x07]
1982  %b = load <32 x i16>, <32 x i16>* %ptr_b
1983  %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
1984  ret <32 x i16> %res
1985}
1986
1987define <32 x i16> @test_mask_mullo_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
1988  ;CHECK-LABEL: test_mask_mullo_epi16_rmk_512
1989  ;CHECK: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f]
1990  %b = load <32 x i16>, <32 x i16>* %ptr_b
1991  %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
1992  ret <32 x i16> %res
1993}
1994
1995define <32 x i16> @test_mask_mullo_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
1996  ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_512
1997  ;CHECK: vpmullw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0x07]
1998  %b = load <32 x i16>, <32 x i16>* %ptr_b
1999  %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
2000  ret <32 x i16> %res
2001}
2002
2003declare <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
2004
2005define <8 x i16> @test_mask_mullo_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
2006  ;CHECK-LABEL: test_mask_mullo_epi16_rr_128
2007  ;CHECK: vpmullw %xmm1, %xmm0, %xmm0     ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0xc1]
2008  %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2009  ret <8 x i16> %res
2010}
2011
2012define <8 x i16> @test_mask_mullo_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
2013  ;CHECK-LABEL: test_mask_mullo_epi16_rrk_128
2014  ;CHECK: vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0xd1]
2015  %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2016  ret <8 x i16> %res
2017}
2018
2019define <8 x i16> @test_mask_mullo_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
2020  ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_128
2021  ;CHECK: vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1]
2022  %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2023  ret <8 x i16> %res
2024}
2025
2026define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
2027  ;CHECK-LABEL: test_mask_mullo_epi16_rm_128
2028  ;CHECK: vpmullw (%rdi), %xmm0, %xmm0    ## encoding: [0x62,0xf1,0x7d,0x08,0xd5,0x07]
2029  %b = load <8 x i16>, <8 x i16>* %ptr_b
2030  %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2031  ret <8 x i16> %res
2032}
2033
2034define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
2035  ;CHECK-LABEL: test_mask_mullo_epi16_rmk_128
2036  ;CHECK: vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f]
2037  %b = load <8 x i16>, <8 x i16>* %ptr_b
2038  %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2039  ret <8 x i16> %res
2040}
2041
2042define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
2043  ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_128
2044  ;CHECK: vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07]
2045  %b = load <8 x i16>, <8 x i16>* %ptr_b
2046  %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2047  ret <8 x i16> %res
2048}
2049
2050declare <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
2051
2052define <16 x i16> @test_mask_mullo_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
2053  ;CHECK-LABEL: test_mask_mullo_epi16_rr_256
2054  ;CHECK: vpmullw %ymm1, %ymm0, %ymm0     ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0xc1]
2055  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2056  ret <16 x i16> %res
2057}
2058
2059define <16 x i16> @test_mask_mullo_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
2060  ;CHECK-LABEL: test_mask_mullo_epi16_rrk_256
2061  ;CHECK: vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1]
2062  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2063  ret <16 x i16> %res
2064}
2065
2066define <16 x i16> @test_mask_mullo_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
2067  ;CHECK-LABEL: test_mask_mullo_epi16_rrkz_256
2068  ;CHECK: vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1]
2069  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2070  ret <16 x i16> %res
2071}
2072
2073define <16 x i16> @test_mask_mullo_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
2074  ;CHECK-LABEL: test_mask_mullo_epi16_rm_256
2075  ;CHECK: vpmullw (%rdi), %ymm0, %ymm0    ## encoding: [0x62,0xf1,0x7d,0x28,0xd5,0x07]
2076  %b = load <16 x i16>, <16 x i16>* %ptr_b
2077  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2078  ret <16 x i16> %res
2079}
2080
2081define <16 x i16> @test_mask_mullo_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
2082  ;CHECK-LABEL: test_mask_mullo_epi16_rmk_256
2083  ;CHECK: vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f]
2084  %b = load <16 x i16>, <16 x i16>* %ptr_b
2085  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2086  ret <16 x i16> %res
2087}
2088
2089define <16 x i16> @test_mask_mullo_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
2090  ;CHECK-LABEL: test_mask_mullo_epi16_rmkz_256
2091  ;CHECK: vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07]
2092  %b = load <16 x i16>, <16 x i16>* %ptr_b
2093  %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2094  ret <16 x i16> %res
2095}
2096
2097declare <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
2098
2099
2100define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
2101  ;CHECK-LABEL: test_mask_packs_epi32_rr_128
2102  ;CHECK: vpackssdw       %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0xc1]
2103  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
2104  ret <8 x i16> %res
2105}
2106
2107define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
2108  ;CHECK-LABEL: test_mask_packs_epi32_rrk_128
2109  ;CHECK: vpackssdw       %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
2110  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
2111  ret <8 x i16> %res
2112}
2113
2114define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
2115  ;CHECK-LABEL: test_mask_packs_epi32_rrkz_128
2116  ;CHECK: vpackssdw       %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
2117  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
2118  ret <8 x i16> %res
2119}
2120
2121define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
2122  ;CHECK-LABEL: test_mask_packs_epi32_rm_128
2123  ;CHECK: vpackssdw       (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0x07]
2124  %b = load <4 x i32>, <4 x i32>* %ptr_b
2125  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
2126  ret <8 x i16> %res
2127}
2128
2129define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
2130  ;CHECK-LABEL: test_mask_packs_epi32_rmk_128
2131  ;CHECK: vpackssdw       (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
2132  %b = load <4 x i32>, <4 x i32>* %ptr_b
2133  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
2134  ret <8 x i16> %res
2135}
2136
2137define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
2138  ;CHECK-LABEL: test_mask_packs_epi32_rmkz_128
2139  ;CHECK: vpackssdw       (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
2140  %b = load <4 x i32>, <4 x i32>* %ptr_b
2141  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
2142  ret <8 x i16> %res
2143}
2144
2145define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
2146  ;CHECK-LABEL: test_mask_packs_epi32_rmb_128
2147  ;CHECK: vpackssdw       (%rdi){1to4}, %xmm0, %xmm0  ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
2148  %q = load i32, i32* %ptr_b
2149  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
2150  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
2151  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
2152  ret <8 x i16> %res
2153}
2154
2155define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
2156  ;CHECK-LABEL: test_mask_packs_epi32_rmbk_128
2157  ;CHECK: vpackssdw       (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
2158  %q = load i32, i32* %ptr_b
2159  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
2160  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
2161  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
2162  ret <8 x i16> %res
2163}
2164
2165define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
2166  ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_128
2167  ;CHECK: vpackssdw       (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
2168  %q = load i32, i32* %ptr_b
2169  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
2170  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
2171  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
2172  ret <8 x i16> %res
2173}
2174
2175declare <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
2176
2177define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
2178  ;CHECK-LABEL: test_mask_packs_epi32_rr_256
2179  ;CHECK: vpackssdw       %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
2180  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
2181  ret <16 x i16> %res
2182}
2183
2184define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
2185  ;CHECK-LABEL: test_mask_packs_epi32_rrk_256
2186  ;CHECK: vpackssdw       %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
2187  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
2188  ret <16 x i16> %res
2189}
2190
2191define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
2192  ;CHECK-LABEL: test_mask_packs_epi32_rrkz_256
2193  ;CHECK: vpackssdw       %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
2194  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
2195  ret <16 x i16> %res
2196}
2197
2198define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
2199  ;CHECK-LABEL: test_mask_packs_epi32_rm_256
2200  ;CHECK: vpackssdw       (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0x07]
2201  %b = load <8 x i32>, <8 x i32>* %ptr_b
2202  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
2203  ret <16 x i16> %res
2204}
2205
2206define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
2207  ;CHECK-LABEL: test_mask_packs_epi32_rmk_256
2208  ;CHECK: vpackssdw       (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
2209  %b = load <8 x i32>, <8 x i32>* %ptr_b
2210  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
2211  ret <16 x i16> %res
2212}
2213
2214define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
2215  ;CHECK-LABEL: test_mask_packs_epi32_rmkz_256
2216  ;CHECK: vpackssdw       (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
2217  %b = load <8 x i32>, <8 x i32>* %ptr_b
2218  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
2219  ret <16 x i16> %res
2220}
2221
2222define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
2223  ;CHECK-LABEL: test_mask_packs_epi32_rmb_256
2224  ;CHECK: vpackssdw       (%rdi){1to8}, %ymm0, %ymm0  ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
2225  %q = load i32, i32* %ptr_b
2226  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
2227  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
2228  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
2229  ret <16 x i16> %res
2230}
2231
2232define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
2233  ;CHECK-LABEL: test_mask_packs_epi32_rmbk_256
2234  ;CHECK: vpackssdw       (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
2235  %q = load i32, i32* %ptr_b
2236  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
2237  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
2238  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
2239  ret <16 x i16> %res
2240}
2241
2242define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
2243  ;CHECK-LABEL: test_mask_packs_epi32_rmbkz_256
2244  ;CHECK: vpackssdw       (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
2245  %q = load i32, i32* %ptr_b
2246  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
2247  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
2248  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
2249  ret <16 x i16> %res
2250}
2251
2252declare <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
2253
2254define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
2255  ;CHECK-LABEL: test_mask_packs_epi16_rr_128
2256  ;CHECK: vpacksswb       %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc1]
2257  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
2258  ret <16 x i8> %res
2259}
2260
2261define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
2262  ;CHECK-LABEL: test_mask_packs_epi16_rrk_128
2263  ;CHECK: vpacksswb       %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0xd1]
2264  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
2265  ret <16 x i8> %res
2266}
2267
2268define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
2269  ;CHECK-LABEL: test_mask_packs_epi16_rrkz_128
2270  ;CHECK: vpacksswb       %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0xc1]
2271  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
2272  ret <16 x i8> %res
2273}
2274
2275define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
2276  ;CHECK-LABEL: test_mask_packs_epi16_rm_128
2277  ;CHECK: vpacksswb       (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0x07]
2278  %b = load <8 x i16>, <8 x i16>* %ptr_b
2279  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
2280  ret <16 x i8> %res
2281}
2282
2283define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
2284  ;CHECK-LABEL: test_mask_packs_epi16_rmk_128
2285  ;CHECK: vpacksswb       (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x63,0x0f]
2286  %b = load <8 x i16>, <8 x i16>* %ptr_b
2287  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
2288  ret <16 x i8> %res
2289}
2290
2291define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
2292  ;CHECK-LABEL: test_mask_packs_epi16_rmkz_128
2293  ;CHECK: vpacksswb       (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x63,0x07]
2294  %b = load <8 x i16>, <8 x i16>* %ptr_b
2295  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
2296  ret <16 x i8> %res
2297}
2298
2299declare <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
2300
2301define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
2302  ;CHECK-LABEL: test_mask_packs_epi16_rr_256
2303  ;CHECK: vpacksswb       %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
2304  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
2305  ret <32 x i8> %res
2306}
2307
2308define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
2309  ;CHECK-LABEL: test_mask_packs_epi16_rrk_256
2310  ;CHECK: vpacksswb       %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0xd1]
2311  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
2312  ret <32 x i8> %res
2313}
2314
2315define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
2316  ;CHECK-LABEL: test_mask_packs_epi16_rrkz_256
2317  ;CHECK: vpacksswb       %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0xc1]
2318  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
2319  ret <32 x i8> %res
2320}
2321
2322define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
2323  ;CHECK-LABEL: test_mask_packs_epi16_rm_256
2324  ;CHECK: vpacksswb       (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0x07]
2325  %b = load <16 x i16>, <16 x i16>* %ptr_b
2326  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
2327  ret <32 x i8> %res
2328}
2329
2330define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
2331  ;CHECK-LABEL: test_mask_packs_epi16_rmk_256
2332  ;CHECK: vpacksswb       (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x63,0x0f]
2333  %b = load <16 x i16>, <16 x i16>* %ptr_b
2334  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
2335  ret <32 x i8> %res
2336}
2337
2338define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
2339  ;CHECK-LABEL: test_mask_packs_epi16_rmkz_256
2340  ;CHECK: vpacksswb       (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x63,0x07]
2341  %b = load <16 x i16>, <16 x i16>* %ptr_b
2342  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
2343  ret <32 x i8> %res
2344}
2345
2346declare <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
2347
2348
2349define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
2350  ;CHECK-LABEL: test_mask_packus_epi32_rr_128
2351  ;CHECK: vpackusdw       %xmm1, %xmm0, %xmm0
2352  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
2353  ret <8 x i16> %res
2354}
2355
2356define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
2357  ;CHECK-LABEL: test_mask_packus_epi32_rrk_128
2358  ;CHECK: vpackusdw       %xmm1, %xmm0, %xmm2 {%k1}
2359  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
2360  ret <8 x i16> %res
2361}
2362
2363define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
2364  ;CHECK-LABEL: test_mask_packus_epi32_rrkz_128
2365  ;CHECK: vpackusdw       %xmm1, %xmm0, %xmm0 {%k1} {z}
2366  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
2367  ret <8 x i16> %res
2368}
2369
2370define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
2371  ;CHECK-LABEL: test_mask_packus_epi32_rm_128
2372  ;CHECK: vpackusdw       (%rdi), %xmm0, %xmm0
2373  %b = load <4 x i32>, <4 x i32>* %ptr_b
2374  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
2375  ret <8 x i16> %res
2376}
2377
2378define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
2379  ;CHECK-LABEL: test_mask_packus_epi32_rmk_128
2380  ;CHECK: vpackusdw       (%rdi), %xmm0, %xmm1 {%k1}
2381  %b = load <4 x i32>, <4 x i32>* %ptr_b
2382  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
2383  ret <8 x i16> %res
2384}
2385
2386define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
2387  ;CHECK-LABEL: test_mask_packus_epi32_rmkz_128
2388  ;CHECK: vpackusdw       (%rdi), %xmm0, %xmm0 {%k1} {z}
2389  %b = load <4 x i32>, <4 x i32>* %ptr_b
2390  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
2391  ret <8 x i16> %res
2392}
2393
2394define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
2395  ;CHECK-LABEL: test_mask_packus_epi32_rmb_128
2396  ;CHECK: vpackusdw       (%rdi){1to4}, %xmm0, %xmm0
2397  %q = load i32, i32* %ptr_b
2398  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
2399  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
2400  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
2401  ret <8 x i16> %res
2402}
2403
2404define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
2405  ;CHECK-LABEL: test_mask_packus_epi32_rmbk_128
2406  ;CHECK: vpackusdw       (%rdi){1to4}, %xmm0, %xmm1 {%k1}
2407  %q = load i32, i32* %ptr_b
2408  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
2409  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
2410  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
2411  ret <8 x i16> %res
2412}
2413
2414define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
2415  ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_128
2416  ;CHECK: vpackusdw       (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z}
2417  %q = load i32, i32* %ptr_b
2418  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
2419  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
2420  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
2421  ret <8 x i16> %res
2422}
2423
2424declare <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
2425
2426define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
2427  ;CHECK-LABEL: test_mask_packus_epi32_rr_256
2428  ;CHECK: vpackusdw       %ymm1, %ymm0, %ymm0
2429  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
2430  ret <16 x i16> %res
2431}
2432
2433define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
2434  ;CHECK-LABEL: test_mask_packus_epi32_rrk_256
2435  ;CHECK: vpackusdw       %ymm1, %ymm0, %ymm2 {%k1}
2436  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
2437  ret <16 x i16> %res
2438}
2439
2440define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
2441  ;CHECK-LABEL: test_mask_packus_epi32_rrkz_256
2442  ;CHECK: vpackusdw       %ymm1, %ymm0, %ymm0 {%k1} {z}
2443  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
2444  ret <16 x i16> %res
2445}
2446
2447define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
2448  ;CHECK-LABEL: test_mask_packus_epi32_rm_256
2449  ;CHECK: vpackusdw       (%rdi), %ymm0, %ymm0
2450  %b = load <8 x i32>, <8 x i32>* %ptr_b
2451  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
2452  ret <16 x i16> %res
2453}
2454
2455define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
2456  ;CHECK-LABEL: test_mask_packus_epi32_rmk_256
2457  ;CHECK: vpackusdw       (%rdi), %ymm0, %ymm1 {%k1}
2458  %b = load <8 x i32>, <8 x i32>* %ptr_b
2459  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
2460  ret <16 x i16> %res
2461}
2462
2463define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
2464  ;CHECK-LABEL: test_mask_packus_epi32_rmkz_256
2465  ;CHECK: vpackusdw       (%rdi), %ymm0, %ymm0 {%k1} {z}
2466  %b = load <8 x i32>, <8 x i32>* %ptr_b
2467  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
2468  ret <16 x i16> %res
2469}
2470
2471define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
2472  ;CHECK-LABEL: test_mask_packus_epi32_rmb_256
2473  ;CHECK: vpackusdw       (%rdi){1to8}, %ymm0, %ymm0
2474  %q = load i32, i32* %ptr_b
2475  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
2476  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
2477  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
2478  ret <16 x i16> %res
2479}
2480
2481define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
2482  ;CHECK-LABEL: test_mask_packus_epi32_rmbk_256
2483  ;CHECK: vpackusdw       (%rdi){1to8}, %ymm0, %ymm1 {%k1}
2484  %q = load i32, i32* %ptr_b
2485  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
2486  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
2487  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
2488  ret <16 x i16> %res
2489}
2490
2491define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
2492  ;CHECK-LABEL: test_mask_packus_epi32_rmbkz_256
2493  ;CHECK: vpackusdw       (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z}
2494  %q = load i32, i32* %ptr_b
2495  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
2496  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
2497  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
2498  ret <16 x i16> %res
2499}
2500
2501declare <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
2502
2503define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
2504  ;CHECK-LABEL: test_mask_packus_epi16_rr_128
2505  ;CHECK: vpackuswb       %xmm1, %xmm0, %xmm0
2506  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
2507  ret <16 x i8> %res
2508}
2509
2510define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
2511  ;CHECK-LABEL: test_mask_packus_epi16_rrk_128
2512  ;CHECK: vpackuswb       %xmm1, %xmm0, %xmm2 {%k1}
2513  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
2514  ret <16 x i8> %res
2515}
2516
2517define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
2518  ;CHECK-LABEL: test_mask_packus_epi16_rrkz_128
2519  ;CHECK: vpackuswb       %xmm1, %xmm0, %xmm0 {%k1} {z}
2520  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
2521  ret <16 x i8> %res
2522}
2523
2524define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
2525  ;CHECK-LABEL: test_mask_packus_epi16_rm_128
2526  ;CHECK: vpackuswb       (%rdi), %xmm0, %xmm0
2527  %b = load <8 x i16>, <8 x i16>* %ptr_b
2528  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
2529  ret <16 x i8> %res
2530}
2531
2532define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
2533  ;CHECK-LABEL: test_mask_packus_epi16_rmk_128
2534  ;CHECK: vpackuswb       (%rdi), %xmm0, %xmm1 {%k1}
2535  %b = load <8 x i16>, <8 x i16>* %ptr_b
2536  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
2537  ret <16 x i8> %res
2538}
2539
2540define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
2541  ;CHECK-LABEL: test_mask_packus_epi16_rmkz_128
2542  ;CHECK: vpackuswb       (%rdi), %xmm0, %xmm0 {%k1} {z}
2543  %b = load <8 x i16>, <8 x i16>* %ptr_b
2544  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
2545  ret <16 x i8> %res
2546}
2547
2548declare <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
2549
2550define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
2551  ;CHECK-LABEL: test_mask_packus_epi16_rr_256
2552  ;CHECK: vpackuswb       %ymm1, %ymm0, %ymm0
2553  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
2554  ret <32 x i8> %res
2555}
2556
2557define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
2558  ;CHECK-LABEL: test_mask_packus_epi16_rrk_256
2559  ;CHECK: vpackuswb       %ymm1, %ymm0, %ymm2 {%k1}
2560  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
2561  ret <32 x i8> %res
2562}
2563
2564define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
2565  ;CHECK-LABEL: test_mask_packus_epi16_rrkz_256
2566  ;CHECK: vpackuswb       %ymm1, %ymm0, %ymm0 {%k1} {z}
2567  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
2568  ret <32 x i8> %res
2569}
2570
2571define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
2572  ;CHECK-LABEL: test_mask_packus_epi16_rm_256
2573  ;CHECK: vpackuswb       (%rdi), %ymm0, %ymm0
2574  %b = load <16 x i16>, <16 x i16>* %ptr_b
2575  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
2576  ret <32 x i8> %res
2577}
2578
2579define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
2580  ;CHECK-LABEL: test_mask_packus_epi16_rmk_256
2581  ;CHECK: vpackuswb       (%rdi), %ymm0, %ymm1 {%k1}
2582  %b = load <16 x i16>, <16 x i16>* %ptr_b
2583  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
2584  ret <32 x i8> %res
2585}
2586
2587define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
2588  ;CHECK-LABEL: test_mask_packus_epi16_rmkz_256
2589  ;CHECK: vpackuswb       (%rdi), %ymm0, %ymm0 {%k1} {z}
2590  %b = load <16 x i16>, <16 x i16>* %ptr_b
2591  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
2592  ret <32 x i8> %res
2593}
2594
2595declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
2596
2597define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
2598  ;CHECK-LABEL: test_mask_adds_epi16_rr_128
2599  ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0
2600  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2601  ret <8 x i16> %res
2602}
2603
2604define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
2605  ;CHECK-LABEL: test_mask_adds_epi16_rrk_128
2606  ;CHECK: vpaddsw %xmm1, %xmm0, %xmm2 {%k1}
2607  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2608  ret <8 x i16> %res
2609}
2610
2611define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
2612  ;CHECK-LABEL: test_mask_adds_epi16_rrkz_128
2613  ;CHECK: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z}
2614  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2615  ret <8 x i16> %res
2616}
2617
2618define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
2619  ;CHECK-LABEL: test_mask_adds_epi16_rm_128
2620  ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0
2621  %b = load <8 x i16>, <8 x i16>* %ptr_b
2622  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2623  ret <8 x i16> %res
2624}
2625
2626define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
2627  ;CHECK-LABEL: test_mask_adds_epi16_rmk_128
2628  ;CHECK: vpaddsw (%rdi), %xmm0, %xmm1 {%k1}
2629  %b = load <8 x i16>, <8 x i16>* %ptr_b
2630  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2631  ret <8 x i16> %res
2632}
2633
2634define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
2635  ;CHECK-LABEL: test_mask_adds_epi16_rmkz_128
2636  ;CHECK: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z}
2637  %b = load <8 x i16>, <8 x i16>* %ptr_b
2638  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2639  ret <8 x i16> %res
2640}
2641
2642declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
2643
2644define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
2645  ;CHECK-LABEL: test_mask_adds_epi16_rr_256
2646  ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0
2647  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2648  ret <16 x i16> %res
2649}
2650
2651define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
2652  ;CHECK-LABEL: test_mask_adds_epi16_rrk_256
2653  ;CHECK: vpaddsw %ymm1, %ymm0, %ymm2 {%k1}
2654  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2655  ret <16 x i16> %res
2656}
2657
2658define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
2659  ;CHECK-LABEL: test_mask_adds_epi16_rrkz_256
2660  ;CHECK: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z}
2661  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2662  ret <16 x i16> %res
2663}
2664
2665define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
2666  ;CHECK-LABEL: test_mask_adds_epi16_rm_256
2667  ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0
2668  %b = load <16 x i16>, <16 x i16>* %ptr_b
2669  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2670  ret <16 x i16> %res
2671}
2672
2673define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
2674  ;CHECK-LABEL: test_mask_adds_epi16_rmk_256
2675  ;CHECK: vpaddsw (%rdi), %ymm0, %ymm1 {%k1}
2676  %b = load <16 x i16>, <16 x i16>* %ptr_b
2677  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2678  ret <16 x i16> %res
2679}
2680
2681define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
2682  ;CHECK-LABEL: test_mask_adds_epi16_rmkz_256
2683  ;CHECK: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z}
2684  %b = load <16 x i16>, <16 x i16>* %ptr_b
2685  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2686  ret <16 x i16> %res
2687}
2688
2689declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
2690
2691define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
2692  ;CHECK-LABEL: test_mask_subs_epi16_rr_128
2693  ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0
2694  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2695  ret <8 x i16> %res
2696}
2697
2698define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
2699  ;CHECK-LABEL: test_mask_subs_epi16_rrk_128
2700  ;CHECK: vpsubsw %xmm1, %xmm0, %xmm2 {%k1}
2701  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2702  ret <8 x i16> %res
2703}
2704
2705define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
2706  ;CHECK-LABEL: test_mask_subs_epi16_rrkz_128
2707  ;CHECK: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z}
2708  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2709  ret <8 x i16> %res
2710}
2711
2712define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
2713  ;CHECK-LABEL: test_mask_subs_epi16_rm_128
2714  ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0
2715  %b = load <8 x i16>, <8 x i16>* %ptr_b
2716  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2717  ret <8 x i16> %res
2718}
2719
2720define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
2721  ;CHECK-LABEL: test_mask_subs_epi16_rmk_128
2722  ;CHECK: vpsubsw (%rdi), %xmm0, %xmm1 {%k1}
2723  %b = load <8 x i16>, <8 x i16>* %ptr_b
2724  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2725  ret <8 x i16> %res
2726}
2727
2728define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
2729  ;CHECK-LABEL: test_mask_subs_epi16_rmkz_128
2730  ;CHECK: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z}
2731  %b = load <8 x i16>, <8 x i16>* %ptr_b
2732  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2733  ret <8 x i16> %res
2734}
2735
2736declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
2737
2738define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
2739  ;CHECK-LABEL: test_mask_subs_epi16_rr_256
2740  ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0
2741  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2742  ret <16 x i16> %res
2743}
2744
2745define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
2746  ;CHECK-LABEL: test_mask_subs_epi16_rrk_256
2747  ;CHECK: vpsubsw %ymm1, %ymm0, %ymm2 {%k1}
2748  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2749  ret <16 x i16> %res
2750}
2751
2752define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
2753  ;CHECK-LABEL: test_mask_subs_epi16_rrkz_256
2754  ;CHECK: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z}
2755  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2756  ret <16 x i16> %res
2757}
2758
2759define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
2760  ;CHECK-LABEL: test_mask_subs_epi16_rm_256
2761  ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0
2762  %b = load <16 x i16>, <16 x i16>* %ptr_b
2763  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2764  ret <16 x i16> %res
2765}
2766
2767define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
2768  ;CHECK-LABEL: test_mask_subs_epi16_rmk_256
2769  ;CHECK: vpsubsw (%rdi), %ymm0, %ymm1 {%k1}
2770  %b = load <16 x i16>, <16 x i16>* %ptr_b
2771  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2772  ret <16 x i16> %res
2773}
2774
2775define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
2776  ;CHECK-LABEL: test_mask_subs_epi16_rmkz_256
2777  ;CHECK: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z}
2778  %b = load <16 x i16>, <16 x i16>* %ptr_b
2779  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2780  ret <16 x i16> %res
2781}
2782
2783declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
2784
2785define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
2786  ;CHECK-LABEL: test_mask_adds_epu16_rr_128
2787  ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0
2788  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2789  ret <8 x i16> %res
2790}
2791
2792define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
2793  ;CHECK-LABEL: test_mask_adds_epu16_rrk_128
2794  ;CHECK: vpaddusw %xmm1, %xmm0, %xmm2 {%k1}
2795  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2796  ret <8 x i16> %res
2797}
2798
2799define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
2800  ;CHECK-LABEL: test_mask_adds_epu16_rrkz_128
2801  ;CHECK: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z}
2802  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2803  ret <8 x i16> %res
2804}
2805
2806define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
2807  ;CHECK-LABEL: test_mask_adds_epu16_rm_128
2808  ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0
2809  %b = load <8 x i16>, <8 x i16>* %ptr_b
2810  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2811  ret <8 x i16> %res
2812}
2813
2814define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
2815  ;CHECK-LABEL: test_mask_adds_epu16_rmk_128
2816  ;CHECK: vpaddusw (%rdi), %xmm0, %xmm1 {%k1}
2817  %b = load <8 x i16>, <8 x i16>* %ptr_b
2818  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2819  ret <8 x i16> %res
2820}
2821
2822define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
2823  ;CHECK-LABEL: test_mask_adds_epu16_rmkz_128
2824  ;CHECK: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z}
2825  %b = load <8 x i16>, <8 x i16>* %ptr_b
2826  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2827  ret <8 x i16> %res
2828}
2829
2830declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
2831
2832define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
2833  ;CHECK-LABEL: test_mask_adds_epu16_rr_256
2834  ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0
2835  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2836  ret <16 x i16> %res
2837}
2838
2839define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
2840  ;CHECK-LABEL: test_mask_adds_epu16_rrk_256
2841  ;CHECK: vpaddusw %ymm1, %ymm0, %ymm2 {%k1}
2842  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2843  ret <16 x i16> %res
2844}
2845
2846define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
2847  ;CHECK-LABEL: test_mask_adds_epu16_rrkz_256
2848  ;CHECK: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z}
2849  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2850  ret <16 x i16> %res
2851}
2852
2853define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
2854  ;CHECK-LABEL: test_mask_adds_epu16_rm_256
2855  ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0
2856  %b = load <16 x i16>, <16 x i16>* %ptr_b
2857  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2858  ret <16 x i16> %res
2859}
2860
2861define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
2862  ;CHECK-LABEL: test_mask_adds_epu16_rmk_256
2863  ;CHECK: vpaddusw (%rdi), %ymm0, %ymm1 {%k1}
2864  %b = load <16 x i16>, <16 x i16>* %ptr_b
2865  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2866  ret <16 x i16> %res
2867}
2868
2869define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
2870  ;CHECK-LABEL: test_mask_adds_epu16_rmkz_256
2871  ;CHECK: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z}
2872  %b = load <16 x i16>, <16 x i16>* %ptr_b
2873  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2874  ret <16 x i16> %res
2875}
2876
2877declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
2878
2879define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
2880  ;CHECK-LABEL: test_mask_subs_epu16_rr_128
2881  ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0
2882  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2883  ret <8 x i16> %res
2884}
2885
2886define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
2887  ;CHECK-LABEL: test_mask_subs_epu16_rrk_128
2888  ;CHECK: vpsubusw %xmm1, %xmm0, %xmm2 {%k1}
2889  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2890  ret <8 x i16> %res
2891}
2892
2893define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
2894  ;CHECK-LABEL: test_mask_subs_epu16_rrkz_128
2895  ;CHECK: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z}
2896  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2897  ret <8 x i16> %res
2898}
2899
2900define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
2901  ;CHECK-LABEL: test_mask_subs_epu16_rm_128
2902  ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0
2903  %b = load <8 x i16>, <8 x i16>* %ptr_b
2904  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
2905  ret <8 x i16> %res
2906}
2907
2908define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
2909  ;CHECK-LABEL: test_mask_subs_epu16_rmk_128
2910  ;CHECK: vpsubusw (%rdi), %xmm0, %xmm1 {%k1}
2911  %b = load <8 x i16>, <8 x i16>* %ptr_b
2912  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
2913  ret <8 x i16> %res
2914}
2915
2916define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
2917  ;CHECK-LABEL: test_mask_subs_epu16_rmkz_128
2918  ;CHECK: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z}
2919  %b = load <8 x i16>, <8 x i16>* %ptr_b
2920  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
2921  ret <8 x i16> %res
2922}
2923
2924declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
2925
2926define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
2927  ;CHECK-LABEL: test_mask_subs_epu16_rr_256
2928  ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0
2929  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2930  ret <16 x i16> %res
2931}
2932
2933define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
2934  ;CHECK-LABEL: test_mask_subs_epu16_rrk_256
2935  ;CHECK: vpsubusw %ymm1, %ymm0, %ymm2 {%k1}
2936  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2937  ret <16 x i16> %res
2938}
2939
2940define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
2941  ;CHECK-LABEL: test_mask_subs_epu16_rrkz_256
2942  ;CHECK: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z}
2943  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2944  ret <16 x i16> %res
2945}
2946
2947define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
2948  ;CHECK-LABEL: test_mask_subs_epu16_rm_256
2949  ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0
2950  %b = load <16 x i16>, <16 x i16>* %ptr_b
2951  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
2952  ret <16 x i16> %res
2953}
2954
2955define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
2956  ;CHECK-LABEL: test_mask_subs_epu16_rmk_256
2957  ;CHECK: vpsubusw (%rdi), %ymm0, %ymm1 {%k1}
2958  %b = load <16 x i16>, <16 x i16>* %ptr_b
2959  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
2960  ret <16 x i16> %res
2961}
2962
2963define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
2964  ;CHECK-LABEL: test_mask_subs_epu16_rmkz_256
2965  ;CHECK: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z}
2966  %b = load <16 x i16>, <16 x i16>* %ptr_b
2967  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
2968  ret <16 x i16> %res
2969}
2970
2971declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
2972
2973define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
2974  ;CHECK-LABEL: test_mask_adds_epi8_rr_128
2975  ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0
2976  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
2977  ret <16 x i8> %res
2978}
2979
2980define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
2981  ;CHECK-LABEL: test_mask_adds_epi8_rrk_128
2982  ;CHECK: vpaddsb %xmm1, %xmm0, %xmm2 {%k1}
2983  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
2984  ret <16 x i8> %res
2985}
2986
2987define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
2988  ;CHECK-LABEL: test_mask_adds_epi8_rrkz_128
2989  ;CHECK: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z}
2990  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
2991  ret <16 x i8> %res
2992}
2993
2994define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
2995  ;CHECK-LABEL: test_mask_adds_epi8_rm_128
2996  ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0
2997  %b = load <16 x i8>, <16 x i8>* %ptr_b
2998  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
2999  ret <16 x i8> %res
3000}
3001
3002define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
3003  ;CHECK-LABEL: test_mask_adds_epi8_rmk_128
3004  ;CHECK: vpaddsb (%rdi), %xmm0, %xmm1 {%k1}
3005  %b = load <16 x i8>, <16 x i8>* %ptr_b
3006  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
3007  ret <16 x i8> %res
3008}
3009
3010define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
3011  ;CHECK-LABEL: test_mask_adds_epi8_rmkz_128
3012  ;CHECK: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z}
3013  %b = load <16 x i8>, <16 x i8>* %ptr_b
3014  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
3015  ret <16 x i8> %res
3016}
3017
3018declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3019
3020define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
3021  ;CHECK-LABEL: test_mask_adds_epi8_rr_256
3022  ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0
3023  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
3024  ret <32 x i8> %res
3025}
3026
3027define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
3028  ;CHECK-LABEL: test_mask_adds_epi8_rrk_256
3029  ;CHECK: vpaddsb %ymm1, %ymm0, %ymm2 {%k1}
3030  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
3031  ret <32 x i8> %res
3032}
3033
3034define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
3035  ;CHECK-LABEL: test_mask_adds_epi8_rrkz_256
3036  ;CHECK: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z}
3037  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
3038  ret <32 x i8> %res
3039}
3040
3041define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
3042  ;CHECK-LABEL: test_mask_adds_epi8_rm_256
3043  ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0
3044  %b = load <32 x i8>, <32 x i8>* %ptr_b
3045  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
3046  ret <32 x i8> %res
3047}
3048
3049define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
3050  ;CHECK-LABEL: test_mask_adds_epi8_rmk_256
3051  ;CHECK: vpaddsb (%rdi), %ymm0, %ymm1 {%k1}
3052  %b = load <32 x i8>, <32 x i8>* %ptr_b
3053  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
3054  ret <32 x i8> %res
3055}
3056
3057define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
3058  ;CHECK-LABEL: test_mask_adds_epi8_rmkz_256
3059  ;CHECK: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z}
3060  %b = load <32 x i8>, <32 x i8>* %ptr_b
3061  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
3062  ret <32 x i8> %res
3063}
3064
3065declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3066
3067define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
3068  ;CHECK-LABEL: test_mask_subs_epi8_rr_128
3069  ;CHECK: vpsubsb %xmm1, %xmm0, %xmm0
3070  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
3071  ret <16 x i8> %res
3072}
3073
3074define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
3075  ;CHECK-LABEL: test_mask_subs_epi8_rrk_128
3076  ;CHECK: vpsubsb %xmm1, %xmm0, %xmm2 {%k1}
3077  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
3078  ret <16 x i8> %res
3079}
3080
3081define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
3082  ;CHECK-LABEL: test_mask_subs_epi8_rrkz_128
3083  ;CHECK: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z}
3084  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
3085  ret <16 x i8> %res
3086}
3087
3088define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
3089  ;CHECK-LABEL: test_mask_subs_epi8_rm_128
3090  ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0
3091  %b = load <16 x i8>, <16 x i8>* %ptr_b
3092  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
3093  ret <16 x i8> %res
3094}
3095
3096define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
3097  ;CHECK-LABEL: test_mask_subs_epi8_rmk_128
3098  ;CHECK: vpsubsb (%rdi), %xmm0, %xmm1 {%k1}
3099  %b = load <16 x i8>, <16 x i8>* %ptr_b
3100  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
3101  ret <16 x i8> %res
3102}
3103
3104define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
3105  ;CHECK-LABEL: test_mask_subs_epi8_rmkz_128
3106  ;CHECK: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z}
3107  %b = load <16 x i8>, <16 x i8>* %ptr_b
3108  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
3109  ret <16 x i8> %res
3110}
3111
3112declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3113
3114define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
3115  ;CHECK-LABEL: test_mask_subs_epi8_rr_256
3116  ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0
3117  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
3118  ret <32 x i8> %res
3119}
3120
3121define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
3122  ;CHECK-LABEL: test_mask_subs_epi8_rrk_256
3123  ;CHECK: vpsubsb %ymm1, %ymm0, %ymm2 {%k1}
3124  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
3125  ret <32 x i8> %res
3126}
3127
3128define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
3129  ;CHECK-LABEL: test_mask_subs_epi8_rrkz_256
3130  ;CHECK: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z}
3131  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
3132  ret <32 x i8> %res
3133}
3134
3135define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
3136  ;CHECK-LABEL: test_mask_subs_epi8_rm_256
3137  ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0
3138  %b = load <32 x i8>, <32 x i8>* %ptr_b
3139  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
3140  ret <32 x i8> %res
3141}
3142
3143define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
3144  ;CHECK-LABEL: test_mask_subs_epi8_rmk_256
3145  ;CHECK: vpsubsb (%rdi), %ymm0, %ymm1 {%k1}
3146  %b = load <32 x i8>, <32 x i8>* %ptr_b
3147  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
3148  ret <32 x i8> %res
3149}
3150
3151define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
3152  ;CHECK-LABEL: test_mask_subs_epi8_rmkz_256
3153  ;CHECK: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z}
3154  %b = load <32 x i8>, <32 x i8>* %ptr_b
3155  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
3156  ret <32 x i8> %res
3157}
3158
3159declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3160
3161define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
3162  ;CHECK-LABEL: test_mask_adds_epu8_rr_128
3163  ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0
3164  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
3165  ret <16 x i8> %res
3166}
3167
3168define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
3169  ;CHECK-LABEL: test_mask_adds_epu8_rrk_128
3170  ;CHECK: vpaddusb %xmm1, %xmm0, %xmm2 {%k1}
3171  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
3172  ret <16 x i8> %res
3173}
3174
3175define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
3176  ;CHECK-LABEL: test_mask_adds_epu8_rrkz_128
3177  ;CHECK: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z}
3178  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
3179  ret <16 x i8> %res
3180}
3181
3182define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
3183  ;CHECK-LABEL: test_mask_adds_epu8_rm_128
3184  ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0
3185  %b = load <16 x i8>, <16 x i8>* %ptr_b
3186  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
3187  ret <16 x i8> %res
3188}
3189
3190define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
3191  ;CHECK-LABEL: test_mask_adds_epu8_rmk_128
3192  ;CHECK: vpaddusb (%rdi), %xmm0, %xmm1 {%k1}
3193  %b = load <16 x i8>, <16 x i8>* %ptr_b
3194  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
3195  ret <16 x i8> %res
3196}
3197
3198define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
3199  ;CHECK-LABEL: test_mask_adds_epu8_rmkz_128
3200  ;CHECK: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z}
3201  %b = load <16 x i8>, <16 x i8>* %ptr_b
3202  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
3203  ret <16 x i8> %res
3204}
3205
3206declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3207
3208define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
3209  ;CHECK-LABEL: test_mask_adds_epu8_rr_256
3210  ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0
3211  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
3212  ret <32 x i8> %res
3213}
3214
3215define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
3216  ;CHECK-LABEL: test_mask_adds_epu8_rrk_256
3217  ;CHECK: vpaddusb %ymm1, %ymm0, %ymm2 {%k1}
3218  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
3219  ret <32 x i8> %res
3220}
3221
3222define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
3223  ;CHECK-LABEL: test_mask_adds_epu8_rrkz_256
3224  ;CHECK: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z}
3225  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
3226  ret <32 x i8> %res
3227}
3228
3229define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
3230  ;CHECK-LABEL: test_mask_adds_epu8_rm_256
3231  ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0
3232  %b = load <32 x i8>, <32 x i8>* %ptr_b
3233  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
3234  ret <32 x i8> %res
3235}
3236
3237define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
3238  ;CHECK-LABEL: test_mask_adds_epu8_rmk_256
3239  ;CHECK: vpaddusb (%rdi), %ymm0, %ymm1 {%k1}
3240  %b = load <32 x i8>, <32 x i8>* %ptr_b
3241  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
3242  ret <32 x i8> %res
3243}
3244
3245define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
3246  ;CHECK-LABEL: test_mask_adds_epu8_rmkz_256
3247  ;CHECK: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z}
3248  %b = load <32 x i8>, <32 x i8>* %ptr_b
3249  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
3250  ret <32 x i8> %res
3251}
3252
3253declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3254
3255define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
3256  ;CHECK-LABEL: test_mask_subs_epu8_rr_128
3257  ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0
3258  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
3259  ret <16 x i8> %res
3260}
3261
3262define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
3263  ;CHECK-LABEL: test_mask_subs_epu8_rrk_128
3264  ;CHECK: vpsubusb %xmm1, %xmm0, %xmm2 {%k1}
3265  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
3266  ret <16 x i8> %res
3267}
3268
3269define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
3270  ;CHECK-LABEL: test_mask_subs_epu8_rrkz_128
3271  ;CHECK: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z}
3272  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
3273  ret <16 x i8> %res
3274}
3275
3276define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
3277  ;CHECK-LABEL: test_mask_subs_epu8_rm_128
3278  ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0
3279  %b = load <16 x i8>, <16 x i8>* %ptr_b
3280  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
3281  ret <16 x i8> %res
3282}
3283
3284define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
3285  ;CHECK-LABEL: test_mask_subs_epu8_rmk_128
3286  ;CHECK: vpsubusb (%rdi), %xmm0, %xmm1 {%k1}
3287  %b = load <16 x i8>, <16 x i8>* %ptr_b
3288  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
3289  ret <16 x i8> %res
3290}
3291
3292define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
3293  ;CHECK-LABEL: test_mask_subs_epu8_rmkz_128
3294  ;CHECK: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z}
3295  %b = load <16 x i8>, <16 x i8>* %ptr_b
3296  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
3297  ret <16 x i8> %res
3298}
3299
3300declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3301
3302define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
3303  ;CHECK-LABEL: test_mask_subs_epu8_rr_256
3304  ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0
3305  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
3306  ret <32 x i8> %res
3307}
3308
3309define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
3310  ;CHECK-LABEL: test_mask_subs_epu8_rrk_256
3311  ;CHECK: vpsubusb %ymm1, %ymm0, %ymm2 {%k1}
3312  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
3313  ret <32 x i8> %res
3314}
3315
3316define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
3317  ;CHECK-LABEL: test_mask_subs_epu8_rrkz_256
3318  ;CHECK: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z}
3319  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
3320  ret <32 x i8> %res
3321}
3322
3323define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
3324  ;CHECK-LABEL: test_mask_subs_epu8_rm_256
3325  ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0
3326  %b = load <32 x i8>, <32 x i8>* %ptr_b
3327  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
3328  ret <32 x i8> %res
3329}
3330
3331define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
3332  ;CHECK-LABEL: test_mask_subs_epu8_rmk_256
3333  ;CHECK: vpsubusb (%rdi), %ymm0, %ymm1 {%k1}
3334  %b = load <32 x i8>, <32 x i8>* %ptr_b
3335  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
3336  ret <32 x i8> %res
3337}
3338
3339define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
3340  ;CHECK-LABEL: test_mask_subs_epu8_rmkz_256
3341  ;CHECK: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z}
3342  %b = load <32 x i8>, <32 x i8>* %ptr_b
3343  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
3344  ret <32 x i8> %res
3345}
3346
3347declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3348
3349declare <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3350
3351; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_128
3352; CHECK-NOT: call
3353; CHECK: vpmaxsb %xmm
3354; CHECK: {%k1}
3355define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
3356  %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2 ,i16 %mask)
3357  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
3358  %res2 = add <16 x i8> %res, %res1
3359  ret <16 x i8> %res2
3360}
3361
3362declare <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3363
3364; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_256
3365; CHECK-NOT: call
3366; CHECK: vpmaxsb %ymm
3367; CHECK: {%k1}
3368define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
3369  %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
3370  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
3371  %res2 = add <32 x i8> %res, %res1
3372  ret <32 x i8> %res2
3373}
3374
3375declare <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3376
3377; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_128
3378; CHECK-NOT: call
3379; CHECK: vpmaxsw %xmm
3380; CHECK: {%k1}
3381define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3382  %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3383  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3384  %res2 = add <8 x i16> %res, %res1
3385  ret <8 x i16> %res2
3386}
3387
3388declare <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3389
3390; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_256
3391; CHECK-NOT: call
3392; CHECK: vpmaxsw %ymm
3393; CHECK: {%k1}
3394define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
3395  %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
3396  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
3397  %res2 = add <16 x i16> %res, %res1
3398  ret <16 x i16> %res2
3399}
3400
3401declare <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3402
3403; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_128
3404; CHECK-NOT: call
3405; CHECK: vpmaxub %xmm
3406; CHECK: {%k1}
3407define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2,i16 %mask) {
3408  %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
3409  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
3410  %res2 = add <16 x i8> %res, %res1
3411  ret <16 x i8> %res2
3412}
3413
3414declare <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3415
3416; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_256
3417; CHECK-NOT: call
3418; CHECK: vpmaxub %ymm
3419; CHECK: {%k1}
3420define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
3421  %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
3422  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
3423  %res2 = add <32 x i8> %res, %res1
3424  ret <32 x i8> %res2
3425}
3426
3427declare <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3428
3429; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_128
3430; CHECK-NOT: call
3431; CHECK: vpmaxuw %xmm
3432; CHECK: {%k1}
3433define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3434  %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3435  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3436  %res2 = add <8 x i16> %res, %res1
3437  ret <8 x i16> %res2
3438}
3439
3440declare <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3441
3442; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_256
3443; CHECK-NOT: call
3444; CHECK: vpmaxuw %ymm
3445; CHECK: {%k1}
3446define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
3447  %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
3448  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
3449  %res2 = add <16 x i16> %res, %res1
3450  ret <16 x i16> %res2
3451}
3452
3453declare <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3454
3455; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_128
3456; CHECK-NOT: call
3457; CHECK: vpminsb %xmm
3458; CHECK: {%k1}
3459define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
3460  %res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
3461  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
3462  %res2 = add <16 x i8> %res, %res1
3463  ret <16 x i8> %res2
3464}
3465
3466declare <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3467
3468; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_256
3469; CHECK-NOT: call
3470; CHECK: vpminsb %ymm
3471; CHECK: {%k1}
3472define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
3473  %res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
3474  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
3475  %res2 = add <32 x i8> %res, %res1
3476  ret <32 x i8> %res2
3477}
3478
3479declare <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3480
3481; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_128
3482; CHECK-NOT: call
3483; CHECK: vpminsw %xmm
3484; CHECK: {%k1}
3485define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3486  %res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3487  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3488  %res2 = add <8 x i16> %res, %res1
3489  ret <8 x i16> %res2
3490}
3491
3492declare <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3493
3494; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_256
3495; CHECK-NOT: call
3496; CHECK: vpminsw %ymm
3497; CHECK: {%k1}
3498define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
3499  %res = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
3500  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
3501  %res2 = add <16 x i16> %res, %res1
3502  ret <16 x i16> %res2
3503}
3504
3505declare <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3506
3507; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_128
3508; CHECK-NOT: call
3509; CHECK: vpminub %xmm
3510; CHECK: {%k1}
3511define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
3512  %res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask)
3513  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask)
3514  %res2 = add <16 x i8> %res, %res1
3515  ret <16 x i8> %res2
3516}
3517
3518declare <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3519
3520; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_256
3521; CHECK-NOT: call
3522; CHECK: vpminub %ymm
3523; CHECK: {%k1}
3524define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
3525  %res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
3526  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
3527  %res2 = add <32 x i8> %res, %res1
3528  ret <32 x i8> %res2
3529}
3530
3531declare <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3532
3533; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_128
3534; CHECK-NOT: call
3535; CHECK: vpminuw %xmm
3536; CHECK: {%k1}
3537define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3538  %res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3539  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3540  %res2 = add <8 x i16> %res, %res1
3541  ret <8 x i16> %res2
3542}
3543
3544declare <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3545
3546; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_256
3547; CHECK-NOT: call
3548; CHECK: vpminuw %ymm
3549; CHECK: {%k1}
3550define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
3551  %res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask)
3552  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask)
3553  %res2 = add <16 x i16> %res, %res1
3554  ret <16 x i16> %res2
3555}
3556
3557declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3558
3559; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_128
3560; CHECK-NOT: call
3561; CHECK: kmov
3562; CHECK: vpermt2w %xmm{{.*}}{%k1}
3563; CHECK-NOT: {z}
3564define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3565  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3566  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3567  %res2 = add <8 x i16> %res, %res1
3568  ret <8 x i16> %res2
3569}
3570
3571declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3572
3573; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_128
3574; CHECK-NOT: call
3575; CHECK: kmov
3576; CHECK: vpermt2w %xmm{{.*}}{%k1} {z}
3577define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3578  %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3579  %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3580  %res2 = add <8 x i16> %res, %res1
3581  ret <8 x i16> %res2
3582}
3583
3584declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3585
3586; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_256
3587; CHECK-NOT: call
3588; CHECK: kmov
3589; CHECK: vpermt2w %ymm{{.*}}{%k1}
3590define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3591  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3592  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3593  %res2 = add <16 x i16> %res, %res1
3594  ret <16 x i16> %res2
3595}
3596
3597declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3598
3599; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_256
3600; CHECK-NOT: call
3601; CHECK: kmov
3602; CHECK: vpermt2w %ymm{{.*}}{%k1} {z}
3603define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3604  %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3605  %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3606  %res2 = add <16 x i16> %res, %res1
3607  ret <16 x i16> %res2
3608}
3609
3610declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3611
3612; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_128
3613; CHECK-NOT: call
3614; CHECK: kmov
3615; CHECK: vpermi2w %xmm{{.*}}{%k1}
3616define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3617  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3618  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3619  %res2 = add <8 x i16> %res, %res1
3620  ret <8 x i16> %res2
3621}
3622
3623declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3624
3625; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_256
3626; CHECK-NOT: call
3627; CHECK: kmov
3628; CHECK: vpermi2w %ymm{{.*}}{%k1}
3629define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3630  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3631  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3632  %res2 = add <16 x i16> %res, %res1
3633  ret <16 x i16> %res2
3634}
3635
3636declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3637
3638; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_128
3639; CHECK-NOT: call
3640; CHECK: vpavgb %xmm
3641; CHECK: {%k1}
3642define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
3643  %res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
3644  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
3645  %res2 = add <16 x i8> %res, %res1
3646  ret <16 x i8> %res2
3647}
3648
3649declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3650
3651; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_256
3652; CHECK-NOT: call
3653; CHECK: vpavgb %ymm
3654; CHECK: {%k1}
3655define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
3656  %res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
3657  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
3658  %res2 = add <32 x i8> %res, %res1
3659  ret <32 x i8> %res2
3660}
3661
3662declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3663
3664; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_128
3665; CHECK-NOT: call
3666; CHECK: vpavgw %xmm
3667; CHECK: {%k1}
3668define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3669  %res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3670  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3671  %res2 = add <8 x i16> %res, %res1
3672  ret <8 x i16> %res2
3673}
3674
3675declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3676
3677; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_256
3678; CHECK-NOT: call
3679; CHECK: vpavgw %ymm
3680; CHECK: {%k1}
3681define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3682  %res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3683  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3684  %res2 = add <16 x i16> %res, %res1
3685  ret <16 x i16> %res2
3686}
3687
3688declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
3689
3690; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_128
3691; CHECK-NOT: call
3692; CHECK: kmov
3693; CHECK: vpshufb %xmm{{.*}}{%k1}
3694define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
3695  %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
3696  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
3697  %res2 = add <16 x i8> %res, %res1
3698  ret <16 x i8> %res2
3699}
3700
3701declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
3702
3703; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_256
3704; CHECK-NOT: call
3705; CHECK: kmov
3706; CHECK: vpshufb %ymm{{.*}}{%k1}
3707define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
3708  %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
3709  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
3710  %res2 = add <32 x i8> %res, %res1
3711  ret <32 x i8> %res2
3712}
3713
3714declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
3715
3716; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_128
3717; CHECK-NOT: call
3718; CHECK: kmov
3719; CHECK: vpabsb{{.*}}{%k1}
3720define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
3721  %res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
3722  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
3723  %res2 = add <16 x i8> %res, %res1
3724  ret <16 x i8> %res2
3725}
3726
3727declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32)
3728
3729; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_256
3730; CHECK-NOT: call
3731; CHECK: kmov
3732; CHECK: vpabsb{{.*}}{%k1}
3733define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
3734  %res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
3735  %res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
3736  %res2 = add <32 x i8> %res, %res1
3737  ret <32 x i8> %res2
3738}
3739
3740declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8)
3741
3742; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_128
3743; CHECK-NOT: call
3744; CHECK: kmov
3745; CHECK: vpabsw{{.*}}{%k1}
3746define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
3747  %res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
3748  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
3749  %res2 = add <8 x i16> %res, %res1
3750  ret <8 x i16> %res2
3751}
3752
3753declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16)
3754
3755; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_256
3756; CHECK-NOT: call
3757; CHECK: kmov
3758; CHECK: vpabsw{{.*}}{%k1}
3759define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
3760  %res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
3761  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
3762  %res2 = add <16 x i16> %res, %res1
3763  ret <16 x i16> %res2
3764}
3765
3766; CHECK-LABEL: test_x86_mask_blend_b_256
3767; CHECK: vpblendmb
3768define <32 x i8> @test_x86_mask_blend_b_256(i32 %a0, <32 x i8> %a1, <32 x i8> %a2) {
3769  %res = call <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8> %a1, <32 x i8> %a2, i32 %a0) ; <<32 x i8>> [#uses=1]
3770  ret <32 x i8> %res
3771}
3772declare <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8>, <32 x i8>, i32) nounwind readonly
3773
3774; CHECK-LABEL: test_x86_mask_blend_w_256
3775define <16 x i16> @test_x86_mask_blend_w_256(i16 %mask, <16 x i16> %a1, <16 x i16> %a2) {
3776  ; CHECK: vpblendmw
3777  %res = call <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16> %a1, <16 x i16> %a2, i16 %mask) ; <<16 x i16>> [#uses=1]
3778  ret <16 x i16> %res
3779}
3780declare <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16>, <16 x i16>, i16) nounwind readonly
3781
3782; CHECK-LABEL: test_x86_mask_blend_b_128
3783; CHECK: vpblendmb
3784define <16 x i8> @test_x86_mask_blend_b_128(i16 %a0, <16 x i8> %a1, <16 x i8> %a2) {
3785  %res = call <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8> %a1, <16 x i8> %a2, i16 %a0) ; <<16 x i8>> [#uses=1]
3786  ret <16 x i8> %res
3787}
3788declare <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8>, <16 x i8>, i16) nounwind readonly
3789
3790; CHECK-LABEL: test_x86_mask_blend_w_128
3791define <8 x i16> @test_x86_mask_blend_w_128(i8 %mask, <8 x i16> %a1, <8 x i16> %a2) {
3792  ; CHECK: vpblendmw
3793  %res = call <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16> %a1, <8 x i16> %a2, i8 %mask) ; <<8 x i16>> [#uses=1]
3794  ret <8 x i16> %res
3795}
3796declare <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16>, <8 x i16>, i8) nounwind readonly
3797
3798declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3799
3800; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_128
3801; CHECK-NOT: call
3802; CHECK: kmov
3803; CHECK: {%k1}
3804; CHECK: vpmulhuw {{.*}}encoding: [0x62
3805define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3806  %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3807  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3808  %res2 = add <8 x i16> %res, %res1
3809  ret <8 x i16> %res2
3810}
3811
3812declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3813
3814; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_256
3815; CHECK-NOT: call
3816; CHECK: kmov
3817; CHECK: {%k1}
3818; CHECK: vpmulhuw {{.*}}encoding: [0x62
3819define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3820  %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3821  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3822  %res2 = add <16 x i16> %res, %res1
3823  ret <16 x i16> %res2
3824}
3825
3826declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3827
3828; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_128
3829; CHECK-NOT: call
3830; CHECK: kmov
3831; CHECK: {%k1}
3832; CHECK: vpmulhw {{.*}}encoding: [0x62
3833define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3834  %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3835  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3836  %res2 = add <8 x i16> %res, %res1
3837  ret <8 x i16> %res2
3838}
3839
3840declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3841; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_256
3842; CHECK-NOT: call
3843; CHECK: kmov
3844; CHECK: {%k1}
3845; CHECK: vpmulhw {{.*}}encoding: [0x62
3846define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3847  %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3848  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3849  %res2 = add <16 x i16> %res, %res1
3850  ret <16 x i16> %res2
3851}
3852
3853declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3854; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_128
3855; CHECK-NOT: call
3856; CHECK: kmov
3857; CHECK: {%k1}
3858; CHECK: vpmulhrsw {{.*}}encoding: [0x62
3859define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3860  %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3861  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3862  %res2 = add <8 x i16> %res, %res1
3863  ret <8 x i16> %res2
3864}
3865
3866declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3867; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_256
3868; CHECK-NOT: call
3869; CHECK: kmov
3870; CHECK: {%k1}
3871; CHECK: vpmulhrsw {{.*}}encoding: [0x62
3872define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3873  %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3874  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3875  %res2 = add <16 x i16> %res, %res1
3876  ret <16 x i16> %res2
3877}
3878
3879declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
3880
3881define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
3882; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
3883; CHECK:       vpmovwb %xmm0, %xmm1 {%k1}
3884; CHECK-NEXT:  vpmovwb %xmm0, %xmm2 {%k1} {z}
3885; CHECK-NEXT:  vpmovwb %xmm0, %xmm0
3886    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
3887    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
3888    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
3889    %res3 = add <16 x i8> %res0, %res1
3890    %res4 = add <16 x i8> %res3, %res2
3891    ret <16 x i8> %res4
3892}
3893
3894declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
3895
3896define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
3897; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
3898; CHECK:  vpmovwb %xmm0, (%rdi)
3899; CHECK:  vpmovwb %xmm0, (%rdi) {%k1}
3900    call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
3901    call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
3902    ret void
3903}
3904
3905declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
3906
3907define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
3908; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
3909; CHECK:       vpmovswb %xmm0, %xmm1 {%k1}
3910; CHECK-NEXT:  vpmovswb %xmm0, %xmm2 {%k1} {z}
3911; CHECK-NEXT:  vpmovswb %xmm0, %xmm0
3912    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
3913    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
3914    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
3915    %res3 = add <16 x i8> %res0, %res1
3916    %res4 = add <16 x i8> %res3, %res2
3917    ret <16 x i8> %res4
3918}
3919
3920declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
3921
3922define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
3923; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
3924; CHECK:  vpmovswb %xmm0, (%rdi)
3925; CHECK:  vpmovswb %xmm0, (%rdi) {%k1}
3926    call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
3927    call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
3928    ret void
3929}
3930
3931declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
3932
3933define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
3934; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
3935; CHECK:       vpmovuswb %xmm0, %xmm1 {%k1}
3936; CHECK-NEXT:  vpmovuswb %xmm0, %xmm2 {%k1} {z}
3937; CHECK-NEXT:  vpmovuswb %xmm0, %xmm0
3938    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
3939    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
3940    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
3941    %res3 = add <16 x i8> %res0, %res1
3942    %res4 = add <16 x i8> %res3, %res2
3943    ret <16 x i8> %res4
3944}
3945
3946declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
3947
3948define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
3949; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
3950; CHECK:  vpmovuswb %xmm0, (%rdi)
3951; CHECK:  vpmovuswb %xmm0, (%rdi) {%k1}
3952    call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
3953    call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
3954    ret void
3955}
3956
3957declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
3958
3959define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
3960; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
3961; CHECK:       vpmovwb %ymm0, %xmm1 {%k1}
3962; CHECK-NEXT:  vpmovwb %ymm0, %xmm2 {%k1} {z}
3963; CHECK-NEXT:  vpmovwb %ymm0, %xmm0
3964    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
3965    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
3966    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
3967    %res3 = add <16 x i8> %res0, %res1
3968    %res4 = add <16 x i8> %res3, %res2
3969    ret <16 x i8> %res4
3970}
3971
3972declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
3973
3974define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
3975; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
3976; CHECK:  vpmovwb %ymm0, (%rdi)
3977; CHECK:  vpmovwb %ymm0, (%rdi) {%k1}
3978    call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
3979    call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
3980    ret void
3981}
3982
3983declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
3984
3985define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
3986; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
3987; CHECK:       vpmovswb %ymm0, %xmm1 {%k1}
3988; CHECK-NEXT:  vpmovswb %ymm0, %xmm2 {%k1} {z}
3989; CHECK-NEXT:  vpmovswb %ymm0, %xmm0
3990    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
3991    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
3992    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
3993    %res3 = add <16 x i8> %res0, %res1
3994    %res4 = add <16 x i8> %res3, %res2
3995    ret <16 x i8> %res4
3996}
3997
3998declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16)
3999
4000define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
4001; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
4002; CHECK:  vpmovswb %ymm0, (%rdi)
4003; CHECK:  vpmovswb %ymm0, (%rdi) {%k1}
4004    call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
4005    call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
4006    ret void
4007}
4008
4009declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
4010
4011define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
4012; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
4013; CHECK:       vpmovuswb %ymm0, %xmm1 {%k1}
4014; CHECK-NEXT:  vpmovuswb %ymm0, %xmm2 {%k1} {z}
4015; CHECK-NEXT:  vpmovuswb %ymm0, %xmm0
4016    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
4017    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
4018    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
4019    %res3 = add <16 x i8> %res0, %res1
4020    %res4 = add <16 x i8> %res3, %res2
4021    ret <16 x i8> %res4
4022}
4023
4024declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16)
4025
4026define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
4027; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
4028; CHECK:  vpmovuswb %ymm0, (%rdi)
4029; CHECK:  vpmovuswb %ymm0, (%rdi) {%k1}
4030    call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
4031    call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
4032    ret void
4033}
4034
4035declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8)
4036
4037define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
4038; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
4039; CHECK:       ## BB#0:
4040; CHECK-NEXT:    movzbl %dil, %eax
4041; CHECK-NEXT:    kmovw %eax, %k1
4042; CHECK-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm2 {%k1}
4043; CHECK-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
4044; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
4045; CHECK-NEXT:    retq
4046  %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3)
4047  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1)
4048  %res2 = add <4 x i32> %res, %res1
4049  ret <4 x i32> %res2
4050}
4051
4052declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8)
4053
4054define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
4055; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
4056; CHECK:       ## BB#0:
4057; CHECK-NEXT:    movzbl %dil, %eax
4058; CHECK-NEXT:    kmovw %eax, %k1
4059; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm2 {%k1}
4060; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
4061; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
4062; CHECK-NEXT:    retq
4063  %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3)
4064  %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1)
4065  %res2 = add <8 x i32> %res, %res1
4066  ret <8 x i32> %res2
4067}
4068
4069declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8)
4070
4071define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
4072; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
4073; CHECK:       ## BB#0:
4074; CHECK-NEXT:    movzbl %dil, %eax
4075; CHECK-NEXT:    kmovw %eax, %k1
4076; CHECK-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1}
4077; CHECK-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
4078; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
4079; CHECK-NEXT:    retq
4080  %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3)
4081  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1)
4082  %res2 = add <8 x i16> %res, %res1
4083  ret <8 x i16> %res2
4084}
4085
4086declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16)
4087
4088define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) {
4089; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
4090; CHECK:       ## BB#0:
4091; CHECK-NEXT:    kmovw %edi, %k1
4092; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1}
4093; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
4094; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
4095; CHECK-NEXT:    retq
4096  %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3)
4097  %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1)
4098  %res2 = add <16 x i16> %res, %res1
4099  ret <16 x i16> %res2
4100}
4101
4102declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
4103
4104define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
4105; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_128:
4106; CHECK:         vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1}
4107; CHECK-NEXT:    ## xmm2 = xmm2[8],k1[8],xmm2[9],k1[9],xmm2[10],k1[10],xmm2[11],k1[11],xmm2[12],k1[12],xmm2[13],k1[13],xmm2[14],k1[14],xmm2[15],k1[15]
4108; CHECK-NEXT:    vpunpckhbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x68,0xc1]
4109; CHECK-NEXT:    ## xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4110  %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
4111  %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
4112  %res2 = add <16 x i8> %res, %res1
4113  ret <16 x i8> %res2
4114}
4115
4116declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
4117
4118define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
4119; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_128:
4120; CHECK:         vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1}
4121; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3],xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7]
4122; CHECK-NEXT:    vpunpcklbw %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x60,0xc1]
4123; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4124  %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
4125  %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
4126  %res2 = add <16 x i8> %res, %res1
4127  ret <16 x i8> %res2
4128}
4129
4130declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
4131
4132define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
4133; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_256:
4134; CHECK:         vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1}
4135; CHECK-NEXT:    ## ymm2 = ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15],ymm2[24],k1[24],ymm2[25],k1[25],ymm2[26],k1[26],ymm2[27],k1[27],ymm2[28],k1[28],ymm2[29],k1[29],ymm2[30],k1[30],ymm2[31],k1[31]
4136; CHECK-NEXT:    vpunpckhbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x68,0xc1]
4137; CHECK-NEXT:    ## ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
4138  %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
4139  %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
4140  %res2 = add <32 x i8> %res, %res1
4141  ret <32 x i8> %res2
4142}
4143
4144declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
4145
4146define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
4147; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_256:
4148; CHECK:         vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1}
4149; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[16],k1[16],ymm2[17],k1[17],ymm2[18],k1[18],ymm2[19],k1[19],ymm2[20],k1[20],ymm2[21],k1[21],ymm2[22],k1[22],ymm2[23],k1[23]
4150; CHECK-NEXT:    vpunpcklbw %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x60,0xc1]
4151; CHECK-NEXT:    ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
4152  %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
4153  %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
4154  %res2 = add <32 x i8> %res, %res1
4155  ret <32 x i8> %res2
4156}
4157
4158declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
4159
4160define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
4161; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_128:
4162; CHECK:         vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1}
4163; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[0],xmm2[1],k1[1],xmm2[2],k1[2],xmm2[3],k1[3]
4164; CHECK-NEXT:    vpunpcklwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x61,0xc1]
4165; CHECK-NEXT:    ## xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4166  %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
4167  %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
4168  %res2 = add <8 x i16> %res, %res1
4169  ret <8 x i16> %res2
4170}
4171
4172declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
4173
4174define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
4175; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_128:
4176; CHECK:         vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1}
4177; CHECK-NEXT:    ## xmm2 = xmm2[4],k1[4],xmm2[5],k1[5],xmm2[6],k1[6],xmm2[7],k1[7]
4178; CHECK-NEXT:    vpunpckhwd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x69,0xc1]
4179; CHECK-NEXT:    ## xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4180  %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
4181  %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
4182  %res2 = add <8 x i16> %res, %res1
4183  ret <8 x i16> %res2
4184}
4185
4186declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
4187
4188define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
4189; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_256:
4190; CHECK:         vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1}
4191; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[0],ymm2[1],k1[1],ymm2[2],k1[2],ymm2[3],k1[3],ymm2[8],k1[8],ymm2[9],k1[9],ymm2[10],k1[10],ymm2[11],k1[11]
4192; CHECK-NEXT:    vpunpcklwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x61,0xc1]
4193; CHECK-NEXT:    ## ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
4194  %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
4195  %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
4196  %res2 = add <16 x i16> %res, %res1
4197  ret <16 x i16> %res2
4198}
4199
4200declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
4201
4202define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
4203; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_256:
4204; CHECK:         vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1}
4205; CHECK-NEXT:    ## ymm2 = ymm2[4],k1[4],ymm2[5],k1[5],ymm2[6],k1[6],ymm2[7],k1[7],ymm2[12],k1[12],ymm2[13],k1[13],ymm2[14],k1[14],ymm2[15],k1[15]
4206; CHECK-NEXT:    vpunpckhwd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x69,0xc1]
4207; CHECK-NEXT:    ## ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
4208  %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
4209  %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
4210  %res2 = add <16 x i16> %res, %res1
4211  ret <16 x i16> %res2
4212}
4213
4214declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16)
4215
4216define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) {
4217; CHECK-LABEL: test_int_x86_avx512_mask_palignr_128:
4218; CHECK:       ## BB#0:
4219; CHECK-NEXT:    kmovw %edi, %k1
4220; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1}
4221; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
4222; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm0
4223; CHECK-NEXT:    vpaddb %xmm3, %xmm2, %xmm1
4224; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
4225; CHECK-NEXT:    retq
4226  %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4)
4227  %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4)
4228  %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1)
4229  %res3 = add <16 x i8> %res, %res1
4230  %res4 = add <16 x i8> %res3, %res2
4231  ret <16 x i8> %res4
4232}
4233
4234declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <32 x i8>, i32)
4235
4236define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) {
4237; CHECK-LABEL: test_int_x86_avx512_mask_palignr_256:
4238; CHECK:       ## BB#0:
4239; CHECK-NEXT:    kmovd %edi, %k1
4240; CHECK-NEXT:    vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1}
4241; CHECK-NEXT:    vpalignr $2, %ymm1, %ymm0, %ymm3 {%k1} {z}
4242; CHECK-NEXT:    vpalignr $2, %ymm1, %ymm0, %ymm0
4243; CHECK-NEXT:    vpaddb %ymm3, %ymm2, %ymm1
4244; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
4245; CHECK-NEXT:    retq
4246  %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4)
4247  %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4)
4248  %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1)
4249  %res3 = add <32 x i8> %res, %res1
4250  %res4 = add <32 x i8> %res3, %res2
4251  ret <32 x i8> %res4
4252}
4253
4254declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8)
4255
4256define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
4257; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
4258; CHECK:       ## BB#0:
4259; CHECK-NEXT:    movzbl %dil, %eax
4260; CHECK-NEXT:    kmovw %eax, %k1
4261; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1}
4262; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
4263; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm0
4264; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm1
4265; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
4266; CHECK-NEXT:    retq
4267  %res = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
4268  %res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> zeroinitializer, i8 %x4)
4269  %res2 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 -1)
4270  %res3 = add <8 x i16> %res, %res1
4271  %res4 = add <8 x i16> %res2, %res3
4272  ret <8 x i16> %res4
4273}
4274
4275declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32, <16 x i16>, i16)
4276
4277define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
4278; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
4279; CHECK:       ## BB#0:
4280; CHECK-NEXT:    kmovw %edi, %k1
4281; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1}
4282; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm3 {%k1} {z}
4283; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm0
4284; CHECK-NEXT:    vpaddw %ymm3, %ymm2, %ymm1
4285; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
4286; CHECK-NEXT:    retq
4287  %res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4)
4288  %res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> zeroinitializer, i16 %x4)
4289  %res2 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 -1)
4290  %res3 = add <16 x i16> %res, %res1
4291  %res4 = add <16 x i16> %res3, %res2
4292  ret <16 x i16> %res4
4293}
4294
4295declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
4296
4297define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) {
4298; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256:
4299; CHECK:       ## BB#0:
4300; CHECK-NEXT:    kmovd %edi, %k1
4301; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm1 {%k1}
4302; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm2 {%k1} {z}
4303; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
4304; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
4305; CHECK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
4306; CHECK-NEXT:    retq
4307  %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
4308  %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask)
4309  %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask)
4310  %res3 = add <32 x i8> %res, %res1
4311  %res4 = add <32 x i8> %res2, %res3
4312  ret <32 x i8> %res4
4313}
4314
4315declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16)
4316
4317define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) {
4318; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128:
4319; CHECK:       ## BB#0:
4320; CHECK-NEXT:    kmovw %edi, %k1
4321; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm1 {%k1}
4322; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm2 {%k1} {z}
4323; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
4324; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
4325; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
4326; CHECK-NEXT:    retq
4327  %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
4328  %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask)
4329  %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask)
4330  %res3 = add <16 x i8> %res, %res1
4331  %res4 = add <16 x i8> %res2, %res3
4332  ret <16 x i8> %res4
4333}
4334
4335declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16)
4336
4337define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) {
4338; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256:
4339; CHECK:       ## BB#0:
4340; CHECK-NEXT:    kmovw %edi, %k1
4341; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm1 {%k1}
4342; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm2 {%k1} {z}
4343; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
4344; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
4345; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0
4346; CHECK-NEXT:    retq
4347  %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
4348  %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask)
4349  %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask)
4350  %res3 = add <16 x i16> %res, %res1
4351  %res4 = add <16 x i16> %res2, %res3
4352  ret <16 x i16> %res4
4353}
4354
4355declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8)
4356
4357define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) {
4358; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
4359; CHECK:       ## BB#0:
4360; CHECK-NEXT:    movzbl %dil, %eax
4361; CHECK-NEXT:    kmovw %eax, %k1
4362; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm1 {%k1}
4363; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm2 {%k1} {z}
4364; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
4365; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
4366; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
4367; CHECK-NEXT:    retq
4368  %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
4369  %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask)
4370  %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask)
4371  %res3 = add <8 x i16> %res, %res1
4372  %res4 = add <8 x i16> %res2, %res3
4373  ret <8 x i16> %res4
4374}
4375
4376declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64)
4377
4378define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) {
4379; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512:
4380; CHECK:       ## BB#0:
4381; CHECK-NEXT:    kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
4382; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
4383; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xd0]
4384; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xc0]
4385; CHECK-NEXT:    vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
4386; CHECK-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
4387; CHECK-NEXT:    retq ## encoding: [0xc3]
4388  %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
4389  %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask)
4390  %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask)
4391  %res3 = add <64 x i8> %res, %res1
4392  %res4 = add <64 x i8> %res2, %res3
4393  ret <64 x i8> %res4
4394}
4395
4396declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32)
4397
4398define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) {
4399; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512:
4400; CHECK:       ## BB#0:
4401; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
4402; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
4403; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xd0]
4404; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xc0]
4405; CHECK-NEXT:    vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
4406; CHECK-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
4407; CHECK-NEXT:    retq ## encoding: [0xc3]
4408  %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
4409  %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
4410  %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask)
4411  %res3 = add <32 x i16> %res, %res1
4412  %res4 = add <32 x i16> %res2, %res3
4413  ret <32 x i16> %res4
4414}
4415
4416