; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s

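; Broadcast intrinsics. Each test below exercises the intrinsic with an
; all-ones mask, a merge mask into a passthru operand, and a zero mask, then
; sums the three results so a single function covers every masking form.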
declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly

define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vaddps %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly

define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}

declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)

define <16 x i32> @test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res2, %res3
  ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)

define <8 x i64> @test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm2
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1}
; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm1, %zmm2, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}

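; Duplicate shuffles: movsldup/movshdup repeat the even/odd float elements
; and movddup repeats the even double elements; the expected element
; patterns are spelled out inline in the CHECK lines.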
declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res2, %res3
  ret <16 x float> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res2, %res3
  ret <8 x double> %res4
}

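; Cross-lane permutes by immediate. Immediate 3 selects the element pattern
; [3,0,0,0] within each 256-bit half, giving [3,0,0,0,7,4,4,4] overall.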
declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8)

define <8 x i64> @test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermq {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res3, %res2
  ret <8 x i64> %res4
}

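; Masked store intrinsics. Each test performs one masked store and one
; unmasked (all-ones mask) store; the unmasked form must lower to the plain
; instruction with no {%k1} annotation.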
define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
; CHECK-LABEL: test_store1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovups %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovups %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16)

define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
; CHECK-LABEL: test_store2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovupd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovupd %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)

define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
; CHECK-LABEL: test_mask_store_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovaps %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16)

define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
; CHECK-LABEL: test_mask_store_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovapd %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovapd %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)

define void @test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqu64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovdqu64 %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovdqu32 %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)

define void @test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovdqa64 %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
  call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8)

define void @test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqa32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT:    vmovdqa32 %zmm0, (%rsi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
  call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
  ret void
}

declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16)

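; Masked load intrinsics, in aligned (load.*) and unaligned (loadu.*)
; flavors. Each test chains an unmasked load, a merge-masked load that uses
; the first result as passthru, and a zero-masked load.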
define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_aligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovaps (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovaps (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
  %res4 = fadd <16 x float> %res2, %res1
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)

define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovups (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovups (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovups (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
  %res4 = fadd <16 x float> %res2, %res1
  ret <16 x float> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)

define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovapd (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovapd (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
  %res4 = fadd <8 x double> %res2, %res1
  ret <8 x double> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)

define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovupd (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovupd (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
  %res4 = fadd <8 x double> %res2, %res1
  ret <8 x double> %res4
}

declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)

declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)

define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovdqu32 (%rdi), %zmm0
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqu32 (%rsi), %zmm0 {%k1}
; CHECK-NEXT:    vmovdqu32 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
  %res4 = add <16 x i32> %res2, %res1
  ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)

define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm0
; CHECK-NEXT:    kmovw %edx, %k1
; CHECK-NEXT:    vmovdqu64 (%rsi), %zmm0 {%k1}
; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
  %res4 = add <8 x i64> %res2, %res1
  ret <8 x i64> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8*, <16 x i32>, i16)

define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_aligned_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> %res, i16 %mask)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
  %res4 = add <16 x i32> %res2, %res1
  ret <16 x i32> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8*, <8 x i64>, i8)

define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 {%k1} {z}
; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> %res, i8 %mask)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
  %res4 = add <8 x i64> %res2, %res1
  ret <8 x i64> %res4
}

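; In-lane shuffles by immediate: vpermilpd/vpermilps with immediate 22 and
; vpshufd with immediate 3; the decoded per-element patterns appear in the
; CHECK lines.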
declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm2 = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
  %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
  %res3 = fadd <8 x double> %res, %res1
  %res4 = fadd <8 x double> %res3, %res2
  ret <8 x double> %res4
}

declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
  %res3 = fadd <16 x float> %res, %res1
  %res4 = fadd <16 x float> %res3, %res2
  ret <16 x float> %res4
}

declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16)

define <16 x i32> @test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
  %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
  %res3 = add <16 x i32> %res, %res1
  %res4 = add <16 x i32> %res3, %res2
  ret <16 x i32> %res4
}

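; Vector compares produce an i16/i8 mask in a k register, which is copied
; to eax with kmovw; the "kill" comments mark that only ax/al is defined.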
define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)

define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)

define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)

define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT:    retq
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)

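; Unpack high/low tests. Most check only the merge-masked form against the
; unmasked form; punpcklqd.q additionally exercises the zero-masked form.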
declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)

define <8 x double> @test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
  %res2 = fadd <8 x double> %res, %res1
  ret <8 x double> %res2
}

declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)

define <16 x float> @test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
  %res2 = fadd <16 x float> %res, %res1
  ret <16 x float> %res2
}

declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
  %res3 = add <8 x i64> %res, %res1
  %res4 = add <8 x i64> %res2, %res3
  ret <8 x i64> %res4
}

declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
  %res2 = add <8 x i64> %res, %res1
  ret <8 x i64> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
  %res2 = add <16 x i32> %res, %res1
  ret <16 x i32> %res2
}

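; Shift-by-immediate tests (vpslld/vpsllq, vpsrld/vpsrlq, vpsrad/vpsraq),
; each in unmasked, merge-masked, and zero-masked variants.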
define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone

define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone

define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
  ret <16 x i32> %res
}

define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone

define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
  ret <8 x i64> %res
}

define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone

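; Non-temporal stores lower directly to vmovntdq/vmovntpd/vmovntps.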
declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>)

define void @test_storent_q_512(<8 x i64> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_q_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovntdq %zmm0, (%rdi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
  ret void
}

declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>)

define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_pd_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovntpd %zmm0, (%rdi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
  ret void
}

declare void @llvm.x86.avx512.storent.ps.512(i8*, <16 x float>)

define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_ps_512:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovntps %zmm0, (%rdi)
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
  ret void
}

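; Bitwise logic intrinsics. The unmasked tests pass an all-ones mask with a
; zero passthru; the masked tests merge into %passThru, so the result is
; copied back out of zmm2.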
define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_xor_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_xor_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_or_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_mask_or_epi32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_or_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_and_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
}

define <16 x i32> @test_mask_and_epi32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_and_epi32:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret <16 x i32> %res
}

declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_xor_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_or_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_mask_or_epi64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)

define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_and_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
  ret <8 x i64> %res
}

define <8 x i64> @test_mask_and_epi64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)