; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR


target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"


; Unmasked (all-true) gather of 16 floats: splat base + sign-extended i32 indices.
; SCALAR-LABEL: test1
; SCALAR:      extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test1:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test1:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test1:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )


; Masked gather of 16 floats: mask arrives as an i16 bitcast to <16 x i1>.
; SCALAR-LABEL: test2
; SCALAR:      extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT:  %res.phi.else = phi
; SCALAR-NEXT:  %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT:  %ToLoad1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT:  br i1 %ToLoad1, label %cond.load1, label %else2

define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test2:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test2:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
  ret <16 x float> %res
}

; Same pattern as test2 but gathering <16 x i32> (vpgatherdd instead of vgatherdps).
define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test3:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test3:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test3:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  ret <16 x i32> %res
}


; Two gathers from the same addresses, where the second's passthru is the
; first's result; results are summed.
define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test4:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    kmovw %k1, %k2
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm2
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test4:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test4:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    kmovw %k1, %k2
; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; SKX-NEXT:    vmovaps %zmm1, %zmm2
; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}


; Masked scatter of 16 i32s, performed twice to the same addresses.
; SCALAR-LABEL: test5
; SCALAR:        %Mask0 = extractelement <16 x i1> %imask, i32 0
; SCALAR-NEXT:   %ToStore0 = icmp eq i1 %Mask0, true
; SCALAR-NEXT:   br i1 %ToStore0, label %cond.store, label %else
; SCALAR: cond.store:
; SCALAR-NEXT:  %Elt0 = extractelement <16 x i32> %val, i32 0
; SCALAR-NEXT:  %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
; SCALAR-NEXT:  store i32 %Elt0, i32* %Ptr0, align 4
; SCALAR-NEXT:  br label %else
; SCALAR: else:
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT:  %ToStore1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT:  br i1 %ToStore1, label %cond.store1, label %else2

define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-LABEL: test5:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    kmovw %k1, %k2
; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test5:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test5:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    kmovw %k1, %k2
; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  ret void
}

declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )


; All-true gather followed by all-true scatter through the same pointer vector.
; SCALAR-LABEL: test6
; SCALAR:        store i32 %Elt0, i32* %Ptr01, align 4
; SCALAR-NEXT:   %Elt1 = extractelement <8 x i32> %a1, i32 1
; SCALAR-NEXT:   %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
; SCALAR-NEXT:   store i32 %Elt1, i32* %Ptr12, align 4
; SCALAR-NEXT:   %Elt2 = extractelement <8 x i32> %a1, i32 2
; SCALAR-NEXT:   %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
; SCALAR-NEXT:   store i32 %Elt2, i32* %Ptr23, align 4

define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-LABEL: test6:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    kxnorw %k2, %k2, %k2
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test6:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm2
; KNL_32-NEXT:    kxnorw %k2, %k2, %k2
; KNL_32-NEXT:    vpgatherqd (,%zmm2), %ymm1 {%k2}
; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm2) {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test6:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    kxnorw %k2, %k2, %k2
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    retq

  %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)

  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i32>%a
}

; 8-element variant of test4: i8 mask, two gathers from the same addresses, summed.
define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
;
; KNL_64-LABEL: test7:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    movzbl %sil, %eax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT:    kmovw %k1, %k2
; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm2
; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test7:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test7:
; SKX:       # BB#0:
; SKX-NEXT:    kmovb %esi, %k1
; SKX-NEXT:    kmovw %k1, %k2
; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
; SKX-NEXT:    vmovaps %zmm1, %zmm2
; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
; SKX-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer

  %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
  %imask = bitcast i8 %mask to <8 x i1>
  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
  %res = add <8 x i32> %gt1, %gt2
  ret <8 x i32> %res
}

; No uniform base in this case, index <8 x i64> contains addresses,
; each gather call will be split into two
define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test8:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kmovw %edi, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    kmovw %k2, %k3
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
; KNL_64-NEXT:    kmovw %k1, %k3
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm4
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test8:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test8:
; SKX:       # BB#0:
; SKX-NEXT:    kmovw %edi, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    kmovw %k2, %k3
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
; SKX-NEXT:    kmovw %k1, %k3
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm4
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test8:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    kmovw %k1, %k2
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm2
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}

%struct.RT = type { i8, [10 x [20 x i32]], i8 }
%struct.ST = type { i32, double, %struct.RT }

; Masked gather for aggregate types
; Test9 and Test10 should give the same result (scalar and vector indices in GEP)


; Aggregate GEP with all-vector indices into %struct.ST from a splat base.
define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-LABEL: test9:
; KNL_64:       # BB#0: # %entry
; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm1, %zmm1
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
; KNL_64-NEXT:    vpsllq $32, %zmm1, %zmm1
; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test9:
; KNL_32:       # BB#0: # %entry
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT:    vpbroadcastd .LCPI8_0, %ymm3
; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd .LCPI8_1, %ymm3
; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd .LCPI8_2, %ymm1
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test9:
; SKX:       # BB#0: # %entry
; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT:    retq
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

; Same addressing as test9, but with scalar (splatted) struct-field indices in the GEP.
define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-LABEL: test10:
; KNL_64:       # BB#0: # %entry
; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm1, %zmm1
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
; KNL_64-NEXT:    vpsllq $32, %zmm1, %zmm1
; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test10:
; KNL_32:       # BB#0: # %entry
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT:    vpbroadcastd .LCPI9_0, %ymm3
; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd .LCPI9_1, %ymm3
; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd .LCPI9_2, %ymm1
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test10:
; SKX:       # BB#0: # %entry
; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT:    retq
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

; Splat index in GEP, requires broadcast
define <16 x float> @test11(float* %base, i32 %ind) {
; KNL_64-LABEL: test11:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpbroadcastd %esi, %zmm1
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test11:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %zmm1
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test11:
; SKX:       # BB#0:
; SKX-NEXT:    vpbroadcastd %esi, %zmm1
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

; We are checking the uniform base here. It is taken directly from input to vgatherdps
define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test12:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test12:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test12:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

; The same as the previous, but the mask is undefined
define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test13:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test13:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test13:
; SKX:       # BB#0:
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
  ret <16 x float>%res
}

; The base pointer is not splat, can't find uniform base
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-LABEL: test14:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL_64-NEXT:    vpbroadcastq %xmm0, %zmm0
; KNL_64-NEXT:    vmovd %esi, %xmm1
; KNL_64-NEXT:    vpbroadcastd %xmm1, %ymm1
; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT:    vpsllq $2, %zmm1, %zmm1
; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT:    kshiftrw $8, %k0, %k1
; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
; KNL_64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test14:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL_32-NEXT:    vpbroadcastd %xmm0, %zmm0
; KNL_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; KNL_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; KNL_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test14:
; SKX:       # BB#0:
; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
; SKX-NEXT:    vmovd %esi, %xmm1
; SKX-NEXT:    vpbroadcastd %xmm1, %ymm1
; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
; SKX-NEXT:    vpsllq $2, %zmm1, %zmm1
; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT:    kshiftrw $8, %k0, %k1
; SKX-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
; SKX-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
; SKX-NEXT:    vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test14:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; SKX_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; SKX_32-NEXT:    vpbroadcastd %xmm0, %zmm0
; SKX_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; SKX_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; SKX_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
  ret <16 x float>%res
}

declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)

; Gather smaller than existing instruction
define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
;
; KNL_64-LABEL: test15:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm2
; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm0
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; KNL_64-NEXT:    vptestmq %zmm0, %zmm0, %k1
; KNL_64-NEXT:    vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test15:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm2
; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm0
; KNL_32-NEXT:    vpandq .LCPI14_0, %zmm0, %zmm0
; KNL_32-NEXT:    vptestmq %zmm0, %zmm0, %k1
; KNL_32-NEXT:    vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test15:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
  ret <4 x float>%res
}

; Gather smaller than existing instruction
define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
;
; KNL_64-LABEL: test16:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test16:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT:    vpandq .LCPI15_0, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test16:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm1, %k1
; SKX-NEXT:    vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
  %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
  ret <4 x double>%res
}

; test17: <2 x i32> indices are sign-extended to i64; masked gather of
; <2 x double> with %src0 as the pass-through value.
define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
;
; KNL_64-LABEL: test17:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test17:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpandq .LCPI16_0, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test17:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
  ret <2 x double>%res
}

; Declarations for the masked scatter intrinsics exercised by the tests below.
declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )

; test18: masked scatter of <4 x i32> through an explicit vector of pointers.
define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
;
; KNL_64-LABEL: test18:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT:    vpmovsxdq %ymm2, %zmm2
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test18:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT:    vpmovsxdq %ymm2, %zmm2
; KNL_32-NEXT:    vpandq .LCPI17_0, %zmm2, %zmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test18:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm2, %k1
; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
  ret void
}

; test19: addresses formed by a vector GEP off a scalar double* base with
; <4 x i64> indices; masked scatter of <4 x double>.
define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
;
; KNL_64-LABEL: test19:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT:    vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test19:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpandq .LCPI18_0, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test19:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm1, %k1
; SKX-NEXT:    vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test19:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovd2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
; SKX_32-NEXT:    retl
  %gep = getelementptr double, double* %ptr, <4 x i64> %ind
  call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
  ret void
}

; Data type requires widening
; test20: masked scatter of <2 x float> through <2 x float*>.
define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
;
; KNL_64-LABEL: test20:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_64-NEXT:    vmovq %xmm2, %xmm2
; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT:    vpmovsxdq %ymm2, %zmm2
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test20:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL_32-NEXT:    vmovq %xmm2, %xmm2
; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT:    vpmovsxdq %ymm2, %zmm2
; KNL_32-NEXT:    vpandq .LCPI19_0, %zmm2, %zmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test20:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovq2m %xmm2, %k0
; SKX-NEXT:    kshiftlw $2, %k0, %k0
; SKX-NEXT:    kshiftrw $2, %k0, %k1
; SKX-NEXT:    vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
  ret void
}

; Data type requires promotion
; test21: masked scatter of <2 x i32> through <2 x i32*>.
define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
;
; KNL_64-LABEL: test21:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test21:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT:    vpandq .LCPI20_0, %zmm2, %zmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test21:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovq2m %xmm2, %k0
; SKX-NEXT:    kshiftlw $2, %k0, %k0
; SKX-NEXT:    kshiftrw $2, %k0, %k1
; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
  ret void
}

; The result type requires widening
declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)

; test22: <2 x i32> indices sign-extended to i64; masked gather of
; <2 x float> with %src0 as the pass-through value.
define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
;
;
; KNL_64-LABEL: test22:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_64-NEXT:    vmovq %xmm1, %xmm1
; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test22:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; KNL_32-NEXT:    vmovq %xmm1, %xmm1
; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT:    vpandq .LCPI21_0, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test22:
; SKX:       # BB#0:
; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT:    vpmovq2m %xmm1, %k0
; SKX-NEXT:    kshiftlw $2, %k0, %k0
; SKX-NEXT:    kshiftrw $2, %k0, %k1
; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
  ret <2 x float>%res
}

declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)

; test23: <2 x i32> indices sign-extended to i64; masked gather of
; <2 x i32> with %src0 as the pass-through value.
define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
;
; KNL_64-LABEL: test23:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test23:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpandq .LCPI22_0, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test23:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32>%res
}

; test24: same gather as test23 but with a constant all-ones <2 x i1> mask
; and an undef pass-through value.
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
;
;
; KNL_64-LABEL: test24:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    movb $3, %al
; KNL_64-NEXT:    movzbl %al, %eax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test24:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpxord %zmm1, %zmm1, %zmm1
; KNL_32-NEXT:    vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
; KNL_32-NEXT:    vpandq .LCPI23_1, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test24:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32>%res
}

; test25: <2 x i32> indices sign-extended to i64; masked gather of
; <2 x i64> with %src0 as the pass-through value.
define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
;
; KNL_64-LABEL: test25:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test25:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpandq .LCPI24_0, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test25:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
  ret <2 x i64>%res
}

; test26: same gather as test25 but with a constant all-ones <2 x i1> mask.
define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
;
; KNL_64-LABEL: test26:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    movb $3, %al
; KNL_64-NEXT:    movzbl %al, %eax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test26:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT:    vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
; KNL_32-NEXT:    vpandq .LCPI25_1, %zmm2, %zmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test26:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
  ret <2 x i64>%res
}

; Result type requires widening; all-ones mask
define <2 x float> @test27(float* %base, <2 x i32> %ind) {
;
; KNL_64-LABEL: test27:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
; KNL_64-NEXT:    movb $3, %al
; KNL_64-NEXT:    movzbl %al, %eax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test27:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT:    movb $3, %cl
; KNL_32-NEXT:    movzbl %cl, %ecx
; KNL_32-NEXT:    kmovw %ecx, %k1
; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test27:
; SKX:       # BB#0:
; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SKX-NEXT:    movb $3, %al
; SKX-NEXT:    kmovb %eax, %k1
; SKX-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; SKX-NEXT:    retq
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
  ret <2 x float>%res
}

; Data type requires promotion, mask is all-ones
define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
;
;
; KNL_64-LABEL: test28:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT:    movb $3, %al
; KNL_64-NEXT:    movzbl %al, %eax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test28:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
; KNL_32-NEXT:    vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
; KNL_32-NEXT:    vpandq .LCPI27_1, %zmm2, %zmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test28:
; SKX:       # BB#0:
; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT:    movb $3, %al
; SKX-NEXT:    kmovb %eax, %k1
; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT:    retq
  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
  ret void
}


; SCALAR-LABEL: test29
; SCALAR:      extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

; test29: gather of <16 x float> from a splatted base pointer with a
; constant partially-set <16 x i1> mask (elements 2, 3 and 5).
define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test29:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    movw $44, %ax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test29:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    movw $44, %cx
; KNL_32-NEXT:    kmovw %ecx, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test29:
; SKX:       # BB#0:
; SKX-NEXT:    movw $44, %ax
; SKX-NEXT:    kmovw %eax, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
  ret <16 x float>%res
}

; Check non-power-of-2 case. It should be scalarized.
declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
; KNL_64-LABEL: test30:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    andl $1, %edx
; KNL_64-NEXT:    kmovw %edx, %k1
; KNL_64-NEXT:    andl $1, %esi
; KNL_64-NEXT:    kmovw %esi, %k2
; KNL_64-NEXT:    movl %edi, %eax
; KNL_64-NEXT:    andl $1, %eax
; KNL_64-NEXT:    kmovw %eax, %k0
; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT:    vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; KNL_64-NEXT:    # implicit-def: %XMM0
; KNL_64-NEXT:    testb $1, %dil
; KNL_64-NEXT:    je .LBB29_2
; KNL_64-NEXT:  # BB#1: # %cond.load
; KNL_64-NEXT:    vmovq %xmm1, %rax
; KNL_64-NEXT:    vmovd (%rax), %xmm0
; KNL_64-NEXT:  .LBB29_2: # %else
; KNL_64-NEXT:    kmovw %k2, %eax
; KNL_64-NEXT:    movl %eax, %ecx
; KNL_64-NEXT:    andl $1, %ecx
; KNL_64-NEXT:    testb %cl, %cl
; KNL_64-NEXT:    je .LBB29_4
; KNL_64-NEXT:  # BB#3: # %cond.load1
; KNL_64-NEXT:    vpextrq $1, %xmm1, %rcx
; KNL_64-NEXT:    vpinsrd $1, (%rcx), %xmm0, %xmm0
; KNL_64-NEXT:  .LBB29_4: # %else2
; KNL_64-NEXT:    kmovw %k1, %ecx
; KNL_64-NEXT:    movl %ecx, %edx
; KNL_64-NEXT:    andl $1, %edx
; KNL_64-NEXT:    testb %dl, %dl
; KNL_64-NEXT:    je .LBB29_6
; KNL_64-NEXT:  # BB#5: # %cond.load4
; KNL_64-NEXT:    vextracti128 $1, %ymm1, %xmm1
; KNL_64-NEXT:    vmovq %xmm1, %rdx
; KNL_64-NEXT:    vpinsrd $2, (%rdx), %xmm0, %xmm0
; KNL_64-NEXT:  .LBB29_6: # %else5
; KNL_64-NEXT:    kmovw %k0, %edx
; KNL_64-NEXT:    vmovd %edx, %xmm1
; KNL_64-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
; KNL_64-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test30:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    andl $1, %eax
; KNL_32-NEXT:    kmovw %eax, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    andl $1, %eax
; KNL_32-NEXT:    kmovw %eax, %k2
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    movl %eax, %ecx
; KNL_32-NEXT:    andl $1, %ecx
; KNL_32-NEXT:    kmovw %ecx, %k0
; KNL_32-NEXT:    vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; KNL_32-NEXT:    # implicit-def: %XMM0
; KNL_32-NEXT:    testb $1, %al
; KNL_32-NEXT:    je .LBB29_2
; KNL_32-NEXT:  # BB#1: # %cond.load
; KNL_32-NEXT:    vmovd %xmm1, %eax
; KNL_32-NEXT:    vmovd (%eax), %xmm0
; KNL_32-NEXT:  .LBB29_2: # %else
; KNL_32-NEXT:    kmovw %k2, %eax
; KNL_32-NEXT:    movl %eax, %ecx
; KNL_32-NEXT:    andl $1, %ecx
; KNL_32-NEXT:    testb %cl, %cl
; KNL_32-NEXT:    je .LBB29_4
; KNL_32-NEXT:  # BB#3: # %cond.load1
; KNL_32-NEXT:    vpextrd $1, %xmm1, %ecx
; KNL_32-NEXT:    vpinsrd $1, (%ecx), %xmm0, %xmm0
; KNL_32-NEXT:  .LBB29_4: # %else2
; KNL_32-NEXT:    kmovw %k1, %ecx
; KNL_32-NEXT:    movl %ecx, %edx
; KNL_32-NEXT:    andl $1, %edx
; KNL_32-NEXT:    testb %dl, %dl
; KNL_32-NEXT:    je .LBB29_6
; KNL_32-NEXT:  # BB#5: # %cond.load4
; KNL_32-NEXT:    vpextrd $2, %xmm1, %edx
; KNL_32-NEXT:    vpinsrd $2, (%edx), %xmm0, %xmm0
; KNL_32-NEXT:  .LBB29_6: # %else5
; KNL_32-NEXT:    kmovw %k0, %edx
; KNL_32-NEXT:    vmovd %edx, %xmm1
; KNL_32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
; KNL_32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test30:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovd2m %xmm2, %k1
; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    vpmovsxdq %xmm1, %ymm1
; SKX-NEXT:    vpsllq $2, %ymm1, %ymm1
; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT:    # implicit-def: %XMM0
; SKX-NEXT:    andb $1, %al
; SKX-NEXT:    je .LBB29_2
; SKX-NEXT:  # BB#1: # %cond.load
; SKX-NEXT:    vmovq %xmm1, %rax
; SKX-NEXT:    vmovd (%rax), %xmm0
; SKX-NEXT:  .LBB29_2: # %else
; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT:    andb $1, %al
; SKX-NEXT:    je .LBB29_4
; SKX-NEXT:  # BB#3: # %cond.load1
; SKX-NEXT:    vpextrq $1, %xmm1, %rax
; SKX-NEXT:    vpinsrd $1, (%rax), %xmm0, %xmm0
; SKX-NEXT:  .LBB29_4: # %else2
; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
; SKX-NEXT:    andb $1, %al
; SKX-NEXT:    je .LBB29_6
; SKX-NEXT:  # BB#5: # %cond.load4
; SKX-NEXT:    vextracti128 $1, %ymm1, %xmm1
; SKX-NEXT:    vmovq %xmm1, %rax
; SKX-NEXT:    vpinsrd $2, (%rax), %xmm0, %xmm0
; SKX-NEXT:  .LBB29_6: # %else5
; SKX-NEXT:    vmovdqa32 %xmm0, %xmm3 {%k1}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    retq

  %sext_ind = sext <3 x i32> %ind to <3 x i64>
  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
  ret <3 x i32>%res
}

declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)

; KNL-LABEL: test31
; KNL: vpgatherqq
; KNL: vpgatherqq
; test31: gather of 16 pointer values with a constant all-ones mask.
define <16 x float*> @test31(<16 x float**> %ptrs) {
; KNL_64-LABEL: test31:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k1, %k1, %k1
; KNL_64-NEXT:    kxnorw %k2, %k2, %k2
; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
; KNL_64-NEXT:    kshiftrw $8, %k1, %k1
; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    vmovaps %zmm3, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test31:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    kxnorw %k1, %k1, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test31:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k1, %k1, %k1
; SKX-NEXT:    kxnorw %k2, %k2, %k2
; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    vmovaps %zmm3, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test31:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    kxnorw %k1, %k1, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
  ret <16 x float*>%res
}

; Masked gather of <16 x i32> through an explicit vector of pointers.
define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
; KNL_64-LABEL: test_gather_16i32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm2
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16i32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    retl
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
  ret <16 x i32> %res
}
; Masked gather of <16 x i64> through an explicit vector of pointers.
define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; KNL_64-LABEL: test_gather_16i64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp0:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp1:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp2:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpandd .LCPI32_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    vmovaps %zmm4, %zmm1
; SKX-NEXT:    retq
  %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
; Gather of 16 x float. On 64-bit targets the 16 qword pointers span two zmm
; registers, so codegen splits the mask (kshiftrw $8), performs two 8-wide
; vgatherqps into ymm halves, and recombines with vinsertf; the 32-bit target
; emits a single 16-wide vgatherdps since dword pointers fit in one zmm.
; NOTE: CHECK lines are autogenerated by utils/update_llc_test_checks.py.
1543define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
1544; KNL_64-LABEL: test_gather_16f32:
1545; KNL_64:       # BB#0:
1546; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
1547; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1548; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
1549; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
1550; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
1551; KNL_64-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
1552; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
1553; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
1554; KNL_64-NEXT:    retq
1555;
1556; KNL_32-LABEL: test_gather_16f32:
1557; KNL_32:       # BB#0:
1558; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1559; KNL_32-NEXT:    vpandd .LCPI33_0{1to16}, %zmm1, %zmm1
1560; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1561; KNL_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
1562; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
1563; KNL_32-NEXT:    retl
1564;
1565; SKX-LABEL: test_gather_16f32:
1566; SKX:       # BB#0:
1567; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
1568; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1569; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
1570; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm2
1571; SKX-NEXT:    kshiftrw $8, %k1, %k2
1572; SKX-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
1573; SKX-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
1574; SKX-NEXT:    vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
1575; SKX-NEXT:    retq
1576  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
1577  ret <16 x float> %res
1578}
; Gather of 16 x double. Mirrors test_gather_16i64: 64-bit targets split the
; mask and issue two 8-wide vgatherqpd ops; the 32-bit target uses dword
; pointers with two vgatherdpd ops and reads the upper passthru half from the
; stack (8(%ebp)) after realigning %esp to 64 bytes.
; NOTE: CHECK lines are autogenerated by utils/update_llc_test_checks.py.
1579define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
1580; KNL_64-LABEL: test_gather_16f64:
1581; KNL_64:       # BB#0:
1582; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
1583; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1584; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
1585; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
1586; KNL_64-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
1587; KNL_64-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
1588; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
1589; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
1590; KNL_64-NEXT:    retq
1591;
1592; KNL_32-LABEL: test_gather_16f64:
1593; KNL_32:       # BB#0:
1594; KNL_32-NEXT:    pushl %ebp
1595; KNL_32-NEXT:  .Ltmp3:
1596; KNL_32-NEXT:    .cfi_def_cfa_offset 8
1597; KNL_32-NEXT:  .Ltmp4:
1598; KNL_32-NEXT:    .cfi_offset %ebp, -8
1599; KNL_32-NEXT:    movl %esp, %ebp
1600; KNL_32-NEXT:  .Ltmp5:
1601; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
1602; KNL_32-NEXT:    andl $-64, %esp
1603; KNL_32-NEXT:    subl $64, %esp
1604; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1605; KNL_32-NEXT:    vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
1606; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1607; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
1608; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
1609; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
1610; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1611; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
1612; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
1613; KNL_32-NEXT:    movl %ebp, %esp
1614; KNL_32-NEXT:    popl %ebp
1615; KNL_32-NEXT:    retl
1616;
1617; SKX-LABEL: test_gather_16f64:
1618; SKX:       # BB#0:
1619; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
1620; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1621; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
1622; SKX-NEXT:    kshiftrw $8, %k1, %k2
1623; SKX-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
1624; SKX-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
1625; SKX-NEXT:    vmovaps %zmm3, %zmm0
1626; SKX-NEXT:    vmovaps %zmm4, %zmm1
1627; SKX-NEXT:    retq
1628  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
1629  ret <16 x double> %res
1630}
; Intrinsic declaration for the v16f64 masked gather used above.
1631declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
; Scatter of 16 x i32. 64-bit targets hold the 16 qword pointers in two zmm
; registers, so the store is split into two 8-wide vpscatterqd ops with the
; mask halves from kshiftrw; 32-bit targets (KNL_32/SKX_32) emit a single
; 16-wide vpscatterdd over dword pointers.
; NOTE: CHECK lines are autogenerated by utils/update_llc_test_checks.py.
1632define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
1633; KNL_64-LABEL: test_scatter_16i32:
1634; KNL_64:       # BB#0:
1635; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
1636; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1637; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
1638; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
1639; KNL_64-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
1640; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
1641; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
1642; KNL_64-NEXT:    retq
1643;
1644; KNL_32-LABEL: test_scatter_16i32:
1645; KNL_32:       # BB#0:
1646; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1647; KNL_32-NEXT:    vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
1648; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1649; KNL_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
1650; KNL_32-NEXT:    retl
1651;
1652; SKX-LABEL: test_scatter_16i32:
1653; SKX:       # BB#0:
1654; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
1655; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1656; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
1657; SKX-NEXT:    kshiftrw $8, %k1, %k2
1658; SKX-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
1659; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm0
1660; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
1661; SKX-NEXT:    retq
1662;
1663; SKX_32-LABEL: test_scatter_16i32:
1664; SKX_32:       # BB#0:
1665; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1666; SKX_32-NEXT:    vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
1667; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1668; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
1669; SKX_32-NEXT:    retl
1670  call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
1671  ret void
1672}
; Scatter of 16 x i64. 64-bit targets split the mask and issue two 8-wide
; vpscatterqq ops; the 32-bit target loads the upper source half from the
; stack (8(%ebp)) and uses two vpscatterdq ops over dword pointers.
; NOTE: CHECK lines are autogenerated by utils/update_llc_test_checks.py.
1673define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
1674; KNL_64-LABEL: test_scatter_16i64:
1675; KNL_64:       # BB#0:
1676; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
1677; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1678; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
1679; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
1680; KNL_64-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
1681; KNL_64-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
1682; KNL_64-NEXT:    retq
1683;
1684; KNL_32-LABEL: test_scatter_16i64:
1685; KNL_32:       # BB#0:
1686; KNL_32-NEXT:    pushl %ebp
1687; KNL_32-NEXT:  .Ltmp6:
1688; KNL_32-NEXT:    .cfi_def_cfa_offset 8
1689; KNL_32-NEXT:  .Ltmp7:
1690; KNL_32-NEXT:    .cfi_offset %ebp, -8
1691; KNL_32-NEXT:    movl %esp, %ebp
1692; KNL_32-NEXT:  .Ltmp8:
1693; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
1694; KNL_32-NEXT:    andl $-64, %esp
1695; KNL_32-NEXT:    subl $64, %esp
1696; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1697; KNL_32-NEXT:    vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
1698; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1699; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
1700; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
1701; KNL_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
1702; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1703; KNL_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
1704; KNL_32-NEXT:    movl %ebp, %esp
1705; KNL_32-NEXT:    popl %ebp
1706; KNL_32-NEXT:    retl
1707;
1708; SKX-LABEL: test_scatter_16i64:
1709; SKX:       # BB#0:
1710; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
1711; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1712; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
1713; SKX-NEXT:    kshiftrw $8, %k1, %k2
1714; SKX-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
1715; SKX-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
1716; SKX-NEXT:    retq
1717  call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
1718  ret void
1719}
; Intrinsic declaration for the v16i64 masked scatter used above.
1720declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
; Scatter of 16 x float. 64-bit targets split the value register into ymm
; halves (vextract) and issue two 8-wide vscatterqps ops; the 32-bit target
; emits a single 16-wide vscatterdps.
; NOTE: CHECK lines are autogenerated by utils/update_llc_test_checks.py.
1721define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
1722; KNL_64-LABEL: test_scatter_16f32:
1723; KNL_64:       # BB#0:
1724; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
1725; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1726; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
1727; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
1728; KNL_64-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
1729; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm0
1730; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
1731; KNL_64-NEXT:    retq
1732;
1733; KNL_32-LABEL: test_scatter_16f32:
1734; KNL_32:       # BB#0:
1735; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1736; KNL_32-NEXT:    vpandd .LCPI37_0{1to16}, %zmm1, %zmm1
1737; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1738; KNL_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
1739; KNL_32-NEXT:    retl
1740;
1741; SKX-LABEL: test_scatter_16f32:
1742; SKX:       # BB#0:
1743; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
1744; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1745; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
1746; SKX-NEXT:    kshiftrw $8, %k1, %k2
1747; SKX-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
1748; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm0
1749; SKX-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
1750; SKX-NEXT:    retq
1751  call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
1752  ret void
1753}
; Intrinsic declaration for the v16f32 masked scatter used above.
1754declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
; Scatter of 16 x double. Mirrors test_scatter_16i64: 64-bit targets split
; the mask and issue two 8-wide vscatterqpd ops; the 32-bit target loads the
; upper source half from the stack (8(%ebp)) and uses two vscatterdpd ops.
; NOTE: CHECK lines are autogenerated by utils/update_llc_test_checks.py.
1755define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
1756; KNL_64-LABEL: test_scatter_16f64:
1757; KNL_64:       # BB#0:
1758; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
1759; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1760; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
1761; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
1762; KNL_64-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
1763; KNL_64-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
1764; KNL_64-NEXT:    retq
1765;
1766; KNL_32-LABEL: test_scatter_16f64:
1767; KNL_32:       # BB#0:
1768; KNL_32-NEXT:    pushl %ebp
1769; KNL_32-NEXT:  .Ltmp9:
1770; KNL_32-NEXT:    .cfi_def_cfa_offset 8
1771; KNL_32-NEXT:  .Ltmp10:
1772; KNL_32-NEXT:    .cfi_offset %ebp, -8
1773; KNL_32-NEXT:    movl %esp, %ebp
1774; KNL_32-NEXT:  .Ltmp11:
1775; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
1776; KNL_32-NEXT:    andl $-64, %esp
1777; KNL_32-NEXT:    subl $64, %esp
1778; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1779; KNL_32-NEXT:    vpandd .LCPI38_0{1to16}, %zmm1, %zmm1
1780; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1781; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
1782; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
1783; KNL_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
1784; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1785; KNL_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
1786; KNL_32-NEXT:    movl %ebp, %esp
1787; KNL_32-NEXT:    popl %ebp
1788; KNL_32-NEXT:    retl
1789;
1790; SKX-LABEL: test_scatter_16f64:
1791; SKX:       # BB#0:
1792; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
1793; SKX-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1794; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
1795; SKX-NEXT:    kshiftrw $8, %k1, %k2
1796; SKX-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
1797; SKX-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
1798; SKX-NEXT:    retq
1799  call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
1800  ret void
1801}
; Intrinsic declaration for the v16f64 masked scatter used above.
1802declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
1803