; RUN: llc -mtriple=x86_64-apple-darwin  -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512
; RUN: llc -mtriple=x86_64-apple-darwin  -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR
; RUN: llc -mtriple=x86_64-apple-darwin  -mcpu=skx < %s | FileCheck %s --check-prefix=SKX
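; This file tests lowering of the llvm.masked.load and llvm.masked.store
; intrinsics on x86: KNL (AVX512F), core-avx2 (AVX2), and SKX (AVX512VL),
; plus the IR-level scalarization that codegenprepare performs for plain
; AVX (AVX_SCALAR).

; test1: masked load of <16 x i32> with an undef passthru. AVX512 folds the
; zeroing into a single masked vmovdqu32; AVX2 has no 512-bit registers, so
; the load is split into two 256-bit vpmaskmovd with no extra blend; on AVX1
; codegenprepare is expected to expand the intrinsic per lane with
; extractelement/insertelement rather than keep a masked call.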
; AVX512-LABEL: test1
; AVX512: vmovdqu32       (%rdi), %zmm0 {%k1} {z}

; AVX2-LABEL: test1
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2-NOT: blend

; AVX_SCALAR-LABEL: test1
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef)
  ret <16 x i32> %res
}
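; test2 is identical to test1 except that the passthru operand is
; zeroinitializer rather than undef; both map to the same zero-masked load.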
; AVX512-LABEL: test2
; AVX512: vmovdqu32       (%rdi), %zmm0 {%k1} {z}

; AVX2-LABEL: test2
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2-NOT: blend
define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer)
  ret <16 x i32> %res
}
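; test3: the store side of the same pattern. AVX512 uses a masked vmovdqu32
; store, while on AVX1 codegenprepare scalarizes the store into per-lane
; extractelement plus conditional scalar stores.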
; AVX512-LABEL: test3
; AVX512: vmovdqu32       %zmm1, (%rdi) {%k1}

; AVX_SCALAR-LABEL: test3
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
  ret void
}
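; test4 and test5: floating-point loads with a non-trivial passthru (%dst).
; AVX512 can merge-mask into the destination register, while AVX2 must blend
; the masked-load result with %dst explicitly (vblendvpd in the f64 case).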
; AVX512-LABEL: test4
; AVX512: vmovups       (%rdi), %zmm{{.*{%k[1-7]}}}

; AVX2-LABEL: test4
; AVX2: vmaskmovps      {{.*}}(%rdi)
; AVX2: vmaskmovps      {{.*}}(%rdi)
; AVX2: blend
define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst)
  ret <16 x float> %res
}

; AVX512-LABEL: test5
; AVX512: vmovupd (%rdi), %zmm1 {%k1}

; AVX2-LABEL: test5
; AVX2: vmaskmovpd
; AVX2: vblendvpd
; AVX2: vmaskmovpd
; AVX2: vblendvpd
define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst)
  ret <8 x double> %res
}
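; test6 through test10: 128-bit and 256-bit operations. AVX2 still needs a
; vmaskmov plus blend, but SKX (AVX512VL) can use masked moves directly at
; these narrower vector widths.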
; AVX2-LABEL: test6
; AVX2: vmaskmovpd
; AVX2: vblendvpd

; SKX-LABEL: test6
; SKX: vmovupd {{.*}}{%k1}
define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst)
  ret <2 x double> %res
}

; AVX2-LABEL: test7
; AVX2: vmaskmovps      {{.*}}(%rdi)
; AVX2: blend

; SKX-LABEL: test7
; SKX: vmovups (%rdi){{.*}}{%k1}
define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst)
  ret <4 x float> %res
}

; AVX2-LABEL: test8
; AVX2: vpmaskmovd      {{.*}}(%rdi)
; AVX2: blend

; SKX-LABEL: test8
; SKX: vmovdqu32 (%rdi){{.*}}{%k1}
define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst)
  ret <4 x i32> %res
}

; AVX2-LABEL: test9
; AVX2: vpmaskmovd %xmm

; SKX-LABEL: test9
; SKX: vmovdqu32 %xmm{{.*}}{%k1}
define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask)
  ret void
}

; AVX2-LABEL: test10
; AVX2: vmaskmovpd    (%rdi), %ymm
; AVX2: blend

; SKX-LABEL: test10
; SKX: vmovapd {{.*}}{%k1}
define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 32, <4 x i1>%mask, <4 x double>%dst)
  ret <4 x double> %res
}
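; test11a through test11d: <8 x ...> operations on KNL (AVX512F without VL).
; The 256-bit masked operation is not directly available there, so it is
; widened to 512 bits and the upper eight mask bits are cleared with a
; kshiftlw/kshiftrw pair; SKX can use the 256-bit masked move directly.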
; AVX2-LABEL: test11a
; AVX2: vmaskmovps
; AVX2: vblendvps

; SKX-LABEL: test11a
; SKX: vmovaps (%rdi), %ymm1 {%k1}
; AVX512-LABEL: test11a
; AVX512: kshiftlw $8
; AVX512: kshiftrw $8
; AVX512: vmovups (%rdi), %zmm1 {%k1}
define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1>%mask, <8 x float>%dst)
  ret <8 x float> %res
}

; SKX-LABEL: test11b
; SKX: vmovdqu32 (%rdi), %ymm1 {%k1}
; AVX512-LABEL: test11b
; AVX512: kshiftlw        $8
; AVX512: kshiftrw        $8
; AVX512: vmovdqu32 (%rdi), %zmm1 {%k1}
define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
  %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1>%mask, <8 x i32>%dst)
  ret <8 x i32> %res
}

; SKX-LABEL: test11c
; SKX: vmovaps (%rdi), %ymm0 {%k1} {z}
; AVX512-LABEL: test11c
; AVX512: kshiftlw  $8
; AVX512: kshiftrw  $8
; AVX512: vmovups (%rdi), %zmm0 {%k1} {z}
define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
  ret <8 x float> %res
}

; SKX-LABEL: test11d
; SKX: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
; AVX512-LABEL: test11d
; AVX512: kshiftlw  $8
; AVX512: kshiftrw  $8
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
  %res = call <8 x i32> @llvm.masked.load.v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
  ret <8 x i32> %res
}
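; test12 and test13: 256-bit integer and 512-bit float masked stores; SKX and
; AVX512 should emit masked vmovdqu32/vmovups stores directly.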
; AVX2-LABEL: test12
; AVX2: vpmaskmovd %ymm

; SKX-LABEL: test12
; SKX: vmovdqu32 {{.*}}{%k1}
define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
  ret void
}

; AVX512-LABEL: test13
; AVX512: vmovups       %zmm1, (%rdi) {%k1}

define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
  ret void
}
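; test14 through test18: two-element vectors, which are not legal vector
; types on these targets; the mask and data have to be massaged around the
; masked operation (note the kshiftl/kshiftr pairs on SKX and the truncating
; vpmovqd store in test15).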
; AVX2-LABEL: test14
; AVX2: vpshufd
; AVX2: vmovq
; AVX2: vmaskmovps

; SKX-LABEL: test14
; SKX: kshiftl
; SKX: kshiftr
; SKX: vmovups {{.*}}{%k1}

define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

; AVX2-LABEL: test15
; AVX2: vpmaskmovd

; SKX-LABEL: test15:
; SKX:       ## BB#0:
; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT:    vpcmpeqq %xmm2, %xmm0, %k1
; SKX-NEXT:    vpmovqd %xmm1, (%rdi) {%k1}
; SKX-NEXT:    retq
define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask)
  ret void
}

; AVX2-LABEL: test16
; AVX2: vmaskmovps
; AVX2: vblendvps

; SKX-LABEL: test16
; SKX: kshiftl
; SKX: kshiftr
; SKX: vmovups {{.*}}{%k1}
define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst)
  ret <2 x float> %res
}

; AVX2-LABEL: test17
; AVX2: vpmaskmovd
; AVX2: vblendvps
; AVX2: vpmovsxdq

; SKX-LABEL: test17
; SKX: kshiftl
; SKX: kshiftr
; SKX: vmovdqu32 {{.*}}{%k1}
define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)
  ret <2 x i32> %res
}

; AVX2-LABEL: test18
; AVX2: vmaskmovps
; AVX2-NOT: blend
; AVX2: ret
define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; SKX-LABEL: test18:
; SKX:       ## BB#0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SKX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
; SKX-NEXT:    kshiftlw $2, %k0, %k0
; SKX-NEXT:    kshiftrw $2, %k0, %k1
; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT:    retq
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
  ret <2 x float> %res
}
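; test19 through test22: constant masks. codegenprepare folds an all-true
; mask into a plain vector load/store, folds known-false lanes away via a
; select, and reduces a single-lane mask to one extractelement plus a scalar
; store.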
; AVX_SCALAR-LABEL: test19
; AVX_SCALAR: load <4 x float>, <4 x float>* %addr, align 4

define <4 x float> @test19(<4 x i32> %trigger, <4 x float>* %addr) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
  ret <4 x float> %res
}

; AVX_SCALAR-LABEL: test20
; AVX_SCALAR: load float, {{.*}}, align 4
; AVX_SCALAR: insertelement <4 x float> undef, float
; AVX_SCALAR: select <4 x i1> <i1 true, i1 false, i1 true, i1 true>

define <4 x float> @test20(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %src0) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 16, <4 x i1><i1 true, i1 false, i1 true, i1 true>, <4 x float> %src0)
  ret <4 x float> %res
}

; AVX_SCALAR-LABEL: test21
; AVX_SCALAR: store <4 x i32> %val
define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; AVX_SCALAR-LABEL: test22
; AVX_SCALAR: extractelement <4 x i32> %val, i32 0
; AVX_SCALAR:  store i32
define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
  ret void
}
declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <8 x i32> @llvm.masked.load.v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)

declare <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)
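; test23 and test24: masked loads of vectors of pointers. Sixteen 64-bit
; pointer elements do not fit in one zmm register, so the load is split into
; two halves and the upper half of the mask is moved down with kshiftrw.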
; AVX512-LABEL: test23
; AVX512: vmovdqu64       64(%rdi), %zmm1 {%k2} {z}
; AVX512: vmovdqu64       (%rdi), %zmm0 {%k1} {z}

define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
  %mask = icmp eq <16 x i32*> %trigger, zeroinitializer
  %res = call <16 x i32*> @llvm.masked.load.v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1>%mask, <16 x i32*>zeroinitializer)
  ret <16 x i32*> %res
}

%mystruct = type { i16, i16, [1 x i8*] }

declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)

; AVX512-LABEL: test24
; AVX512: vmovdqu64       (%rdi), %zmm0 {%k1} {z}
; AVX512: kshiftrw        $8, %k1, %k1
; AVX512: vmovdqu64       64(%rdi), %zmm1 {%k1} {z}

define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
  %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
  ret <16 x %mystruct*> %res
}
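; test_store_16i64 / test_store_16f64 / test_load_16i64 / test_load_16f64:
; <16 x i64> and <16 x double> are twice the 512-bit register width, so SKX
; splits each operation into two zmm halves, converting the <16 x i1> mask
; with vpmovb2m and shifting out the consumed bits with kshiftrw.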
define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; SKX-LABEL: test_store_16i64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; SKX-LABEL: test_store_16f64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; SKX-LABEL: test_load_16i64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovdqu64 64(%rdi), %zmm2 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    vmovaps %zmm2, %zmm1
; SKX-NEXT:    retq
  %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; SKX-LABEL: test_load_16f64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    vmovaps %zmm2, %zmm1
; SKX-NEXT:    retq
  %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
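; test_load_32f64: <32 x double> takes four zmm loads. The 32-bit mask lives
; in a kmask register; kshiftrd extracts the upper sixteen bits and kshiftrw
; the upper eight bits of each half.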
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0)  {
; SKX-LABEL: test_load_32f64:
; SKX:       ## BB#0:
; SKX-NEXT:    vpmovb2m %ymm0, %k1
; SKX-NEXT:    vmovupd (%rdi), %zmm1 {%k1}
; SKX-NEXT:    kshiftrd $16, %k1, %k2
; SKX-NEXT:    vmovupd 128(%rdi), %zmm3 {%k2}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovupd 64(%rdi), %zmm2 {%k1}
; SKX-NEXT:    kshiftrw $8, %k2, %k1
; SKX-NEXT:    vmovupd 192(%rdi), %zmm4 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    vmovaps %zmm2, %zmm1
; SKX-NEXT:    vmovaps %zmm3, %zmm2
; SKX-NEXT:    vmovaps %zmm4, %zmm3
; SKX-NEXT:    retq
  %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
  ret <32 x double> %res
}
declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)