• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
2; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
3; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
4; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
5; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
6; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
7
8target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
9target triple = "x86_64-unknown-linux-gnu"
10
11
12; SCALAR-LABEL: test1
13; SCALAR:      extractelement <16 x float*>
14; SCALAR-NEXT: load float
15; SCALAR-NEXT: insertelement <16 x float>
16; SCALAR-NEXT: extractelement <16 x float*>
17; SCALAR-NEXT: load float
18
; Splat base pointer + sign-extended i32 indices, all-true constant mask:
; folds to a single vgatherdps whose mask is produced by kxnorw (all-ones).
19define <16 x float> @test1(float* %base, <16 x i32> %ind) {
20; KNL_64-LABEL: test1:
21; KNL_64:       # BB#0:
22; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
23; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
24; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
25; KNL_64-NEXT:    retq
26;
27; KNL_32-LABEL: test1:
28; KNL_32:       # BB#0:
29; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
30; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
31; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
32; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
33; KNL_32-NEXT:    retl
34;
35; SKX-LABEL: test1:
36; SKX:       # BB#0:
37; SKX-NEXT:    kxnorw %k0, %k0, %k1
38; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
39; SKX-NEXT:    vmovaps %zmm1, %zmm0
40; SKX-NEXT:    retq
41
42  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
43  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
44
45  %sext_ind = sext <16 x i32> %ind to <16 x i64>
46  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
47
48  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
49  ret <16 x float>%res
50}
51
52declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
53declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
54declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
55
56
57; SCALAR-LABEL: test2
58; SCALAR:      extractelement <16 x float*>
59; SCALAR-NEXT: load float
60; SCALAR-NEXT: insertelement <16 x float>
61; SCALAR-NEXT: br label %else
62; SCALAR: else:
63; SCALAR-NEXT:  %res.phi.else = phi
64; SCALAR-NEXT:  %Mask1 = extractelement <16 x i1> %imask, i32 1
65; SCALAR-NEXT:  %ToLoad1 = icmp eq i1 %Mask1, true
66; SCALAR-NEXT:  br i1 %ToLoad1, label %cond.load1, label %else2
67
; Same addressing as test1, but the mask comes from a bitcast of an i16
; argument, so a kmovw loads it into a mask register instead of kxnorw.
68define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
69; KNL_64-LABEL: test2:
70; KNL_64:       # BB#0:
71; KNL_64-NEXT:    kmovw %esi, %k1
72; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
73; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
74; KNL_64-NEXT:    retq
75;
76; KNL_32-LABEL: test2:
77; KNL_32:       # BB#0:
78; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
79; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
80; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
81; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
82; KNL_32-NEXT:    retl
83;
84; SKX-LABEL: test2:
85; SKX:       # BB#0:
86; SKX-NEXT:    kmovw %esi, %k1
87; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
88; SKX-NEXT:    vmovaps %zmm1, %zmm0
89; SKX-NEXT:    retq
90
91  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
92  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
93
94  %sext_ind = sext <16 x i32> %ind to <16 x i64>
95  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
96  %imask = bitcast i16 %mask to <16 x i1>
97  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
98  ret <16 x float> %res
99}
100
; Integer variant of test2: v16i32 gather with an i16-derived mask is
; lowered to vpgatherdd under a kmovw-loaded mask register.
101define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
102; KNL_64-LABEL: test3:
103; KNL_64:       # BB#0:
104; KNL_64-NEXT:    kmovw %esi, %k1
105; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
106; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
107; KNL_64-NEXT:    retq
108;
109; KNL_32-LABEL: test3:
110; KNL_32:       # BB#0:
111; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
112; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
113; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
114; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
115; KNL_32-NEXT:    retl
116;
117; SKX-LABEL: test3:
118; SKX:       # BB#0:
119; SKX-NEXT:    kmovw %esi, %k1
120; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
121; SKX-NEXT:    vmovaps %zmm1, %zmm0
122; SKX-NEXT:    retq
123
124  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
125  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
126
127  %sext_ind = sext <16 x i32> %ind to <16 x i64>
128  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
129  %imask = bitcast i16 %mask to <16 x i1>
130  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
131  ret <16 x i32> %res
132}
133
134
; Two gathers with the same mask, the second using the first's result as
; pass-through. AVX-512 gathers update (clear) their mask register on
; completion, so the mask is first copied (kmovw %k1, %k2) to survive
; the first gather.
135define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
136; KNL_64-LABEL: test4:
137; KNL_64:       # BB#0:
138; KNL_64-NEXT:    kmovw %esi, %k1
139; KNL_64-NEXT:    kmovw %k1, %k2
140; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
141; KNL_64-NEXT:    vmovaps %zmm1, %zmm2
142; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
143; KNL_64-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
144; KNL_64-NEXT:    retq
145;
146; KNL_32-LABEL: test4:
147; KNL_32:       # BB#0:
148; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
149; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
150; KNL_32-NEXT:    kmovw %k1, %k2
151; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
152; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
153; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
154; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
155; KNL_32-NEXT:    retl
156;
157; SKX-LABEL: test4:
158; SKX:       # BB#0:
159; SKX-NEXT:    kmovw %esi, %k1
160; SKX-NEXT:    kmovw %k1, %k2
161; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
162; SKX-NEXT:    vmovaps %zmm1, %zmm2
163; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
164; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
165; SKX-NEXT:    retq
166
167  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
168  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
169
170  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
171  %imask = bitcast i16 %mask to <16 x i1>
172  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
173  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
174  %res = add <16 x i32> %gt1, %gt2
175  ret <16 x i32> %res
176}
177
178
179; SCALAR-LABEL: test5
180; SCALAR:        %Mask0 = extractelement <16 x i1> %imask, i32 0
181; SCALAR-NEXT:   %ToStore0 = icmp eq i1 %Mask0, true
182; SCALAR-NEXT:   br i1 %ToStore0, label %cond.store, label %else
183; SCALAR: cond.store:
184; SCALAR-NEXT:  %Elt0 = extractelement <16 x i32> %val, i32 0
185; SCALAR-NEXT:  %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
186; SCALAR-NEXT:  store i32 %Elt0, i32* %Ptr0, align 4
187; SCALAR-NEXT:  br label %else
188; SCALAR: else:
189; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
190; SCALAR-NEXT:  %ToStore1 = icmp eq i1 %Mask1, true
191; SCALAR-NEXT:  br i1 %ToStore1, label %cond.store1, label %else2
192
; Two identical scatters under the same i16-derived mask. As with gather,
; scatter updates its mask register, so the mask is duplicated into %k2
; before the first vpscatterdd.
193define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
194; KNL_64-LABEL: test5:
195; KNL_64:       # BB#0:
196; KNL_64-NEXT:    kmovw %esi, %k1
197; KNL_64-NEXT:    kmovw %k1, %k2
198; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
199; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
200; KNL_64-NEXT:    retq
201;
202; KNL_32-LABEL: test5:
203; KNL_32:       # BB#0:
204; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
205; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
206; KNL_32-NEXT:    kmovw %k1, %k2
207; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
208; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
209; KNL_32-NEXT:    retl
210;
211; SKX-LABEL: test5:
212; SKX:       # BB#0:
213; SKX-NEXT:    kmovw %esi, %k1
214; SKX-NEXT:    kmovw %k1, %k2
215; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
216; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
217; SKX-NEXT:    retq
218
219  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
220  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
221
222  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
223  %imask = bitcast i16 %mask to <16 x i1>
224  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
225  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
226  ret void
227}
228
229declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
230declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
231
232
233; SCALAR-LABEL: test6
234; SCALAR:        store i32 %Elt0, i32* %Ptr01, align 4
235; SCALAR-NEXT:   %Elt1 = extractelement <8 x i32> %a1, i32 1
236; SCALAR-NEXT:   %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
237; SCALAR-NEXT:   store i32 %Elt1, i32* %Ptr12, align 4
238; SCALAR-NEXT:   %Elt2 = extractelement <8 x i32> %a1, i32 2
239; SCALAR-NEXT:   %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
240; SCALAR-NEXT:   store i32 %Elt2, i32* %Ptr23, align 4
241
; Gather then scatter through an explicit vector of pointers (no uniform
; base): both use pure index addressing "(,%zmm1)" with all-ones masks.
; On the 32-bit target the dword pointers are first sign-extended to qword.
242define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
243; KNL_64-LABEL: test6:
244; KNL_64:       # BB#0:
245; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
246; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
247; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
248; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
249; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
250; KNL_64-NEXT:    retq
251;
252; KNL_32-LABEL: test6:
253; KNL_32:       # BB#0:
254; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
255; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm2
256; KNL_32-NEXT:    kxnorw %k0, %k0, %k2
257; KNL_32-NEXT:    vpgatherqd (,%zmm2), %ymm1 {%k2}
258; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm2) {%k1}
259; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
260; KNL_32-NEXT:    retl
261;
262; SKX-LABEL: test6:
263; SKX:       # BB#0:
264; SKX-NEXT:    kxnorw %k0, %k0, %k1
265; SKX-NEXT:    kxnorw %k0, %k0, %k2
266; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
267; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
268; SKX-NEXT:    vmovaps %zmm2, %zmm0
269; SKX-NEXT:    retq
270
271  %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
272
273  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
274  ret <8 x i32>%a
275}
276
; 8-wide gather with an i8 mask, issued twice (second uses the first's
; result as pass-through). KNL sign-extends the indices to qword and uses
; vpgatherqd; SKX (AVX512VL) gathers at ymm width with vpgatherdd and
; loads the 8-bit mask with kmovb.
277define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
278;
279; KNL_64-LABEL: test7:
280; KNL_64:       # BB#0:
281; KNL_64-NEXT:    kmovw %esi, %k1
282; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
283; KNL_64-NEXT:    kmovw %k1, %k2
284; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
285; KNL_64-NEXT:    vmovaps %zmm1, %zmm2
286; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
287; KNL_64-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
288; KNL_64-NEXT:    retq
289;
290; KNL_32-LABEL: test7:
291; KNL_32:       # BB#0:
292; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
293; KNL_32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
294; KNL_32-NEXT:    kmovw %ecx, %k1
295; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
296; KNL_32-NEXT:    kmovw %k1, %k2
297; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
298; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
299; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
300; KNL_32-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
301; KNL_32-NEXT:    retl
302;
303; SKX-LABEL: test7:
304; SKX:       # BB#0:
305; SKX-NEXT:    kmovb %esi, %k1
306; SKX-NEXT:    kmovw %k1, %k2
307; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
308; SKX-NEXT:    vmovaps %zmm1, %zmm2
309; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
310; SKX-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
311; SKX-NEXT:    retq
312
313  %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
314  %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
315
316  %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
317  %imask = bitcast i8 %mask to <8 x i1>
318  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
319  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
320  %res = add <8 x i32> %gt1, %gt2
321  ret <8 x i32> %res
322}
323
324; No uniform base in this case, index <8 x i64> contains addresses,
325; each gather call will be split into two
; No uniform base: the 16 pointers themselves are the addresses. On 64-bit
; targets each 16-wide gather is split into two 8-wide vpgatherqd halves
; (mask halves via kshiftrw $8) and recombined with vinserti*; on 32-bit
; targets pointers are dword-sized, so a single vpgatherdd suffices.
326define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
327; KNL_64-LABEL: test8:
328; KNL_64:       # BB#0:
329; KNL_64-NEXT:    kmovw %edi, %k1
330; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
331; KNL_64-NEXT:    kmovw %k2, %k3
332; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
333; KNL_64-NEXT:    kmovw %k1, %k3
334; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
335; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm4
336; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
337; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
338; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
339; KNL_64-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
340; KNL_64-NEXT:    retq
341;
342; KNL_32-LABEL: test8:
343; KNL_32:       # BB#0:
344; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
345; KNL_32-NEXT:    kmovw %k1, %k2
346; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
347; KNL_32-NEXT:    vmovaps %zmm1, %zmm2
348; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
349; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
350; KNL_32-NEXT:    retl
351;
352; SKX-LABEL: test8:
353; SKX:       # BB#0:
354; SKX-NEXT:    kmovw %edi, %k1
355; SKX-NEXT:    kshiftrw $8, %k1, %k2
356; SKX-NEXT:    kmovw %k2, %k3
357; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
358; SKX-NEXT:    kmovw %k1, %k3
359; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
360; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm4
361; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
362; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
363; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
364; SKX-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
365; SKX-NEXT:    retq
366;
367; SKX_32-LABEL: test8:
368; SKX_32:       # BB#0:
369; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
370; SKX_32-NEXT:    kmovw %k1, %k2
371; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
372; SKX_32-NEXT:    vmovaps %zmm1, %zmm2
373; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
374; SKX_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
375; SKX_32-NEXT:    retl
376
377  %imask = bitcast i16 %mask to <16 x i1>
378  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
379  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
380  %res = add <16 x i32> %gt1, %gt2
381  ret <16 x i32> %res
382}
383
384%struct.RT = type { i8, [10 x [20 x i32]], i8 }
385%struct.ST = type { i32, double, %struct.RT }
386
387; Masked gather for aggregate types
388; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
389
390
; GEP into %struct.ST with vector indices for every level. The address
; arithmetic (scaling by struct/array element sizes plus the constant
; displacement) is expanded into vector multiply/shift/add sequences
; feeding an all-true vpgatherqd.
391define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
392; KNL_64-LABEL: test9:
393; KNL_64:       # BB#0: # %entry
394; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
395; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
396; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
397; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
398; KNL_64-NEXT:    vpsrlq $32, %zmm1, %zmm1
399; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
400; KNL_64-NEXT:    vpsllq $32, %zmm1, %zmm1
401; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
402; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
403; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
404; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
405; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
406; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
407; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
408; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
409; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
410; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
411; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
412; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
413; KNL_64-NEXT:    retq
414;
415; KNL_32-LABEL: test9:
416; KNL_32:       # BB#0: # %entry
417; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
418; KNL_32-NEXT:    vpbroadcastd .LCPI8_0, %ymm3
419; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
420; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
421; KNL_32-NEXT:    vpbroadcastd .LCPI8_1, %ymm3
422; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
423; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
424; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
425; KNL_32-NEXT:    vpbroadcastd .LCPI8_2, %ymm1
426; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
427; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
428; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
429; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
430; KNL_32-NEXT:    retl
431;
432; SKX-LABEL: test9:
433; SKX:       # BB#0: # %entry
434; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
435; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
436; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
437; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
438; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
439; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
440; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
441; SKX-NEXT:    kxnorw %k0, %k0, %k1
442; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
443; SKX-NEXT:    retq
444entry:
445  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
446  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
447
448  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
449  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
450  ret <8 x i32> %res
451}
452
; Same access as test9 but using scalar indices for the constant GEP
; operands (i32 2, i32 1, i64 13); per the comment above these tests it
; must lower to the same code as test9.
453define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
454; KNL_64-LABEL: test10:
455; KNL_64:       # BB#0: # %entry
456; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
457; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
458; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
459; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm4
460; KNL_64-NEXT:    vpsrlq $32, %zmm1, %zmm1
461; KNL_64-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
462; KNL_64-NEXT:    vpsllq $32, %zmm1, %zmm1
463; KNL_64-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
464; KNL_64-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm3
465; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
466; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
467; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
468; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
469; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
470; KNL_64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
471; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
472; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
473; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
474; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
475; KNL_64-NEXT:    retq
476;
477; KNL_32-LABEL: test10:
478; KNL_32:       # BB#0: # %entry
479; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
480; KNL_32-NEXT:    vpbroadcastd .LCPI9_0, %ymm3
481; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
482; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
483; KNL_32-NEXT:    vpbroadcastd .LCPI9_1, %ymm3
484; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
485; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
486; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
487; KNL_32-NEXT:    vpbroadcastd .LCPI9_2, %ymm1
488; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
489; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
490; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
491; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
492; KNL_32-NEXT:    retl
493;
494; SKX-LABEL: test10:
495; SKX:       # BB#0: # %entry
496; SKX-NEXT:    vpbroadcastq %rdi, %zmm2
497; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
498; SKX-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
499; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
500; SKX-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
501; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
502; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
503; SKX-NEXT:    kxnorw %k0, %k0, %k1
504; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
505; SKX-NEXT:    retq
506entry:
507  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
508  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
509
510  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
511  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
512  ret <8 x i32> %res
513}
514
515; Splat index in GEP, requires broadcast
; Scalar index used in a vector GEP: the index is broadcast into a vector
; register (vpbroadcastd) and an all-true vgatherdps is issued.
516define <16 x float> @test11(float* %base, i32 %ind) {
517; KNL_64-LABEL: test11:
518; KNL_64:       # BB#0:
519; KNL_64-NEXT:    vpbroadcastd %esi, %zmm1
520; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
521; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
522; KNL_64-NEXT:    retq
523;
524; KNL_32-LABEL: test11:
525; KNL_32:       # BB#0:
526; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
527; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %zmm1
528; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
529; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
530; KNL_32-NEXT:    retl
531;
532; SKX-LABEL: test11:
533; SKX:       # BB#0:
534; SKX-NEXT:    vpbroadcastd %esi, %zmm1
535; SKX-NEXT:    kxnorw %k0, %k0, %k1
536; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
537; SKX-NEXT:    retq
538
539  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
540  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
541
542  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
543
544  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
545  ret <16 x float>%res
546}
547
548; We are checking the uniform base here. It is taken directly from input to vgatherdps
; Uniform-base recognition: the scalar %base of the vector GEP is taken
; directly as the base-register operand of vgatherdps.
549define <16 x float> @test12(float* %base, <16 x i32> %ind) {
550; KNL_64-LABEL: test12:
551; KNL_64:       # BB#0:
552; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
553; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
554; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
555; KNL_64-NEXT:    retq
556;
557; KNL_32-LABEL: test12:
558; KNL_32:       # BB#0:
559; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
560; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
561; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
562; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
563; KNL_32-NEXT:    retl
564;
565; SKX-LABEL: test12:
566; SKX:       # BB#0:
567; SKX-NEXT:    kxnorw %k0, %k0, %k1
568; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
569; SKX-NEXT:    vmovaps %zmm1, %zmm0
570; SKX-NEXT:    retq
571
572  %sext_ind = sext <16 x i32> %ind to <16 x i64>
573  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
574
575  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
576  ret <16 x float>%res
577}
578
579; The same as the previous, but the mask is undefined
; Same as test12 but the mask operand is undef: no mask-setup instruction
; (kxnorw/kmovw) is emitted before the gather.
580define <16 x float> @test13(float* %base, <16 x i32> %ind) {
581; KNL_64-LABEL: test13:
582; KNL_64:       # BB#0:
583; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
584; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
585; KNL_64-NEXT:    retq
586;
587; KNL_32-LABEL: test13:
588; KNL_32:       # BB#0:
589; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
590; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
591; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
592; KNL_32-NEXT:    retl
593;
594; SKX-LABEL: test13:
595; SKX:       # BB#0:
596; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
597; SKX-NEXT:    vmovaps %zmm1, %zmm0
598; SKX-NEXT:    retq
599
600  %sext_ind = sext <16 x i32> %ind to <16 x i64>
601  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
602
603  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
604  ret <16 x float>%res
605}
606
607; The base pointer is not splat, can't find uniform base
; %base is inserted at lane 1 of %vec, but the shuffle splats lane 0, so
; there is no recognizable uniform base; the full pointer vector must be
; materialized (insert + broadcast) before the gather.
608define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
609; KNL_64-LABEL: test14:
610; KNL_64:       # BB#0:
611; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
612; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
613; KNL_64-NEXT:    vpbroadcastq %xmm0, %zmm0
614; KNL_64-NEXT:    vmovd %esi, %xmm1
615; KNL_64-NEXT:    vpbroadcastd %xmm1, %ymm1
616; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
617; KNL_64-NEXT:    vpsllq $2, %zmm1, %zmm1
618; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
619; KNL_64-NEXT:    kshiftrw $8, %k0, %k1
620; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
621; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
622; KNL_64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
623; KNL_64-NEXT:    retq
624;
625; KNL_32-LABEL: test14:
626; KNL_32:       # BB#0:
627; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
628; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
629; KNL_32-NEXT:    vpbroadcastd %xmm0, %zmm0
630; KNL_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
631; KNL_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
632; KNL_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
633; KNL_32-NEXT:    retl
634;
635; SKX-LABEL: test14:
636; SKX:       # BB#0:
637; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
638; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
639; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
640; SKX-NEXT:    vpbroadcastd %esi, %ymm1
641; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
642; SKX-NEXT:    vpsllq $2, %zmm1, %zmm1
643; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
644; SKX-NEXT:    kshiftrw $8, %k0, %k1
645; SKX-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
646; SKX-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
647; SKX-NEXT:    vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
648; SKX-NEXT:    retq
649;
650; SKX_32-LABEL: test14:
651; SKX_32:       # BB#0:
652; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
653; SKX_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
654; SKX_32-NEXT:    vpbroadcastd %xmm0, %zmm0
655; SKX_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
656; SKX_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
657; SKX_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
658; SKX_32-NEXT:    retl
659
660  %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
661  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
662
663  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
664
665  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
666  ret <16 x float>%res
667}
668
669declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
670declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
671declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
672
673; Gather smaller than existing instruction
; 4-wide f32 gather, narrower than KNL's native 512-bit width: KNL widens
; indices (vpmovsxdq) and mask (zero upper lanes via vpblendd) and uses
; vgatherqps; SKX (AVX512VL) gathers at xmm width with vgatherdps.
674define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
675;
676; KNL_64-LABEL: test15:
677; KNL_64:       # BB#0:
678; KNL_64:         vpxor %ymm2, %ymm2, %ymm2
679; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
680; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm2
681; KNL_64-NEXT:    vpslld $31, %ymm1, %ymm0
682; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k1
683; KNL_64-NEXT:    vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
684; KNL_64-NEXT:    # kill
685; KNL_64-NEXT:    retq
686;
687; KNL_32-LABEL: test15:
688; KNL_32:       # BB#0:
689; KNL_32:         vpxor %ymm2, %ymm2, %ymm2
690; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
691; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
692; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm2
693; KNL_32-NEXT:    vpslld $31, %ymm1, %ymm0
694; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k1
695; KNL_32-NEXT:    vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
696; KNL_32-NEXT:    # kill
697; KNL_32-NEXT:    retl
698;
699; SKX-LABEL: test15:
700; SKX:       # BB#0:
701; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
702; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
703; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
704; SKX-NEXT:    vmovaps %zmm1, %zmm0
705; SKX-NEXT:    retq
706;
707; SKX_32-LABEL: test15:
708; SKX_32:       # BB#0:
709; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
710; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
711; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
712; SKX_32-NEXT:    vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
713; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
714; SKX_32-NEXT:    retl
715
716  %sext_ind = sext <4 x i32> %ind to <4 x i64>
717  %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
718  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
719  ret <4 x float>%res
720}
721
722; Gather smaller than existing instruction
; 4-wide f64 gather with a pass-through value (%src0): KNL widens the
; 4-bit mask to a 512-bit predicate (shift/sign-extend then vptestmq)
; and uses vgatherqpd; SKX uses the ymm-width vgatherdpd directly.
723define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
724;
725; KNL_64-LABEL: test16:
726; KNL_64:       # BB#0:
727; KNL_64:         vpslld $31, %xmm1, %xmm1
728; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
729; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
730; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
731; KNL_64-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
732; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
733; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
734; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
735; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
736; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
737; KNL_64-NEXT:    retq
738;
739; KNL_32-LABEL: test16:
740; KNL_32:       # BB#0:
741; KNL_32:         vpslld $31, %xmm1, %xmm1
742; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
743; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
744; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
745; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
746; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
747; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
748; KNL_32-NEXT:    vpsllvq .LCPI15_0, %zmm1, %zmm1
749; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
750; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
751; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
752; KNL_32-NEXT:    retl
753;
754; SKX-LABEL: test16:
755; SKX:       # BB#0:
756; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
757; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
758; SKX-NEXT:    vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
759; SKX-NEXT:    vmovaps %zmm2, %zmm0
760; SKX-NEXT:    retq
761;
762; SKX_32-LABEL: test16:
763; SKX_32:       # BB#0:
764; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
765; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
766; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
767; SKX_32-NEXT:    vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
768; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
769; SKX_32-NEXT:    retl
770
771  %sext_ind = sext <4 x i32> %ind to <4 x i64>
772  %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
773  %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
774  ret <4 x double>%res
775}
776
777define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
778;
779; KNL_64-LABEL: test17:
780; KNL_64:       # BB#0:
781; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
782; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
783; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
784; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
785; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
786; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
787; KNL_64-NEXT:    retq
788;
789; KNL_32-LABEL: test17:
790; KNL_32:       # BB#0:
791; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
792; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
793; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
794; KNL_32-NEXT:    vpsllvq .LCPI16_0, %zmm1, %zmm1
795; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
796; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
797; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
798; KNL_32-NEXT:    retl
799;
800; SKX-LABEL: test17:
801; SKX:       # BB#0:
802; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
803; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
804; SKX-NEXT:    vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
805; SKX-NEXT:    vmovaps %zmm2, %zmm0
806; SKX-NEXT:    retq
807;
808; SKX_32-LABEL: test17:
809; SKX_32:       # BB#0:
810; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
811; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
812; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
813; SKX_32-NEXT:    vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
814; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
815; SKX_32-NEXT:    retl
816
817  %sext_ind = sext <2 x i32> %ind to <2 x i64>
818  %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
819  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
820  ret <2 x double>%res
821}
822
823declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
824declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
825declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
826declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
827declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
828
829define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
830;
831; KNL_64-LABEL: test18:
832; KNL_64:       # BB#0:
833; KNL_64:         vpxor %ymm3, %ymm3, %ymm3
834; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
835; KNL_64-NEXT:    vpslld $31, %ymm2, %ymm2
836; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
837; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
838; KNL_64-NEXT:    retq
839;
840; KNL_32-LABEL: test18:
841; KNL_32:       # BB#0:
842; KNL_32:         vpxor %ymm3, %ymm3, %ymm3
843; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
844; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
845; KNL_32-NEXT:    vpslld $31, %ymm2, %ymm2
846; KNL_32-NEXT:    vptestmd %zmm2, %zmm2, %k1
847; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
848; KNL_32-NEXT:    retl
849;
850; SKX-LABEL: test18:
851; SKX:       # BB#0:
852; SKX-NEXT:    vpslld $31, %xmm2, %xmm2
853; SKX-NEXT:    vptestmd %xmm2, %xmm2, %k1
854; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
855; SKX-NEXT:    retq
856;
857; SKX_32-LABEL: test18:
858; SKX_32:       # BB#0:
859; SKX_32-NEXT:    vpslld $31, %xmm2, %xmm2
860; SKX_32-NEXT:    vptestmd %xmm2, %xmm2, %k1
861; SKX_32-NEXT:    vpscatterdd %xmm0, (,%xmm1) {%k1}
862; SKX_32-NEXT:    retl
863  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
864  ret void
865}
866
867define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
868;
869; KNL_64-LABEL: test19:
870; KNL_64:       # BB#0:
871; KNL_64:         vpslld $31, %xmm1, %xmm1
872; KNL_64-NEXT:    vpsrad $31, %xmm1, %xmm1
873; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
874; KNL_64-NEXT:    vpxord %zmm3, %zmm3, %zmm3
875; KNL_64-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
876; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
877; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
878; KNL_64-NEXT:    vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
879; KNL_64-NEXT:    retq
880;
881; KNL_32-LABEL: test19:
882; KNL_32:       # BB#0:
883; KNL_32:         vpslld $31, %xmm1, %xmm1
884; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
885; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
886; KNL_32-NEXT:    vpxord %zmm3, %zmm3, %zmm3
887; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm3, %zmm1
888; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
889; KNL_32-NEXT:    vpsllvq .LCPI18_0, %zmm1, %zmm1
890; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
891; KNL_32-NEXT:    vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
892; KNL_32-NEXT:    retl
893;
894; SKX-LABEL: test19:
895; SKX:       # BB#0:
896; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
897; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
898; SKX-NEXT:    vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
899; SKX-NEXT:    retq
900;
901; SKX_32-LABEL: test19:
902; SKX_32:       # BB#0:
903; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
904; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
905; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
906; SKX_32-NEXT:    vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
907; SKX_32-NEXT:    retl
908  %gep = getelementptr double, double* %ptr, <4 x i64> %ind
909  call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
910  ret void
911}
912
913; Data type requires widening
914define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
915;
916; KNL_64-LABEL: test20:
917; KNL_64:       # BB#0:
918; KNL_64:         vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
919; KNL_64-NEXT:    vmovq {{.*#+}} xmm2 = xmm2[0],zero
920; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
921; KNL_64-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
922; KNL_64-NEXT:    vpslld $31, %ymm2, %ymm2
923; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
924; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
925; KNL_64-NEXT:    retq
926;
927; KNL_32-LABEL: test20:
928; KNL_32:       # BB#0:
929; KNL_32:         vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
930; KNL_32-NEXT:    vmovq {{.*#+}} xmm2 = xmm2[0],zero
931; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
932; KNL_32-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
933; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
934; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
935; KNL_32-NEXT:    vpslld $31, %ymm2, %ymm2
936; KNL_32-NEXT:    vptestmd %zmm2, %zmm2, %k1
937; KNL_32-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
938; KNL_32-NEXT:    retl
939;
940; SKX-LABEL: test20:
941; SKX:       # BB#0:
942; SKX:         vpsllq $63, %xmm2, %xmm2
943; SKX-NEXT:    vptestmq %xmm2, %xmm2, %k0
944; SKX-NEXT:    kshiftlb $6, %k0, %k0
945; SKX-NEXT:    kshiftrb $6, %k0, %k1
946; SKX-NEXT:    vscatterqps %xmm0, (,%ymm1) {%k1}
947; SKX-NEXT:    retq
948;
949; SKX_32-LABEL: test20:
950; SKX_32:       # BB#0:
951; SKX_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
952; SKX_32-NEXT:    vpsllq $63, %xmm2, %xmm2
953; SKX_32-NEXT:    vptestmq %xmm2, %xmm2, %k0
954; SKX_32-NEXT:    kshiftlb $6, %k0, %k0
955; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
956; SKX_32-NEXT:    vscatterdps %xmm0, (,%xmm1) {%k1}
957; SKX_32-NEXT:    retl
958  call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
959  ret void
960}
961
962; Data type requires promotion
963define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
964;
965; KNL_64-LABEL: test21:
966; KNL_64:       # BB#0:
967; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
968; KNL_64-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
969; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
970; KNL_64-NEXT:    vpsllq $63, %zmm2, %zmm2
971; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k1
972; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
973; KNL_64-NEXT:    retq
974;
975; KNL_32-LABEL: test21:
976; KNL_32:       # BB#0:
977; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
978; KNL_32-NEXT:    vinserti32x4 $0, %xmm2, %zmm3, %zmm2
979; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
980; KNL_32-NEXT:    vpsllvq .LCPI20_0, %zmm2, %zmm2
981; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
982; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
983; KNL_32-NEXT:    retl
984;
985; SKX-LABEL: test21:
986; SKX:       # BB#0:
987; SKX:         vpsllq $63, %xmm2, %xmm2
988; SKX-NEXT:    vptestmq %xmm2, %xmm2, %k0
989; SKX-NEXT:    kshiftlb $6, %k0, %k0
990; SKX-NEXT:    kshiftrb $6, %k0, %k1
991; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
992; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
993; SKX-NEXT:    retq
994;
995; SKX_32-LABEL: test21:
996; SKX_32:       # BB#0:
997; SKX_32:         vpsllq $63, %xmm2, %xmm2
998; SKX_32-NEXT:    vptestmq %xmm2, %xmm2, %k0
999; SKX_32-NEXT:    kshiftlb $6, %k0, %k0
1000; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
1001; SKX_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1002; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
1003; SKX_32-NEXT:    retl
1004  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
1005  ret void
1006}
1007
1008; The result type requires widening
1009declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
1010
1011define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
1012;
1013;
1014; KNL_64-LABEL: test22:
1015; KNL_64:       # BB#0:
1016; KNL_64:         vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1017; KNL_64-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
1018; KNL_64-NEXT:    vpxor %ymm3, %ymm3, %ymm3
1019; KNL_64-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1020; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1021; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
1022; KNL_64-NEXT:    vpslld $31, %ymm1, %ymm1
1023; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k1
1024; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
1025; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
1026; KNL_64-NEXT:    retq
1027;
1028; KNL_32-LABEL: test22:
1029; KNL_32:       # BB#0:
1030; KNL_32:         vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1031; KNL_32-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
1032; KNL_32-NEXT:    vpxor %ymm3, %ymm3, %ymm3
1033; KNL_32-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1034; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1035; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1036; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
1037; KNL_32-NEXT:    vpslld $31, %ymm1, %ymm1
1038; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1039; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
1040; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
1041; KNL_32-NEXT:    retl
1042;
1043; SKX-LABEL: test22:
1044; SKX:       # BB#0:
1045; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1046; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
1047; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k0
1048; SKX-NEXT:    kshiftlb $6, %k0, %k0
1049; SKX-NEXT:    kshiftrb $6, %k0, %k1
1050; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
1051; SKX-NEXT:    vmovaps %zmm2, %zmm0
1052; SKX-NEXT:    retq
1053;
1054; SKX_32-LABEL: test22:
1055; SKX_32:       # BB#0:
1056; SKX_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1057; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
1058; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k0
1059; SKX_32-NEXT:    kshiftlb $6, %k0, %k0
1060; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
1061; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1062; SKX_32-NEXT:    vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
1063; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
1064; SKX_32-NEXT:    retl
1065  %sext_ind = sext <2 x i32> %ind to <2 x i64>
1066  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1067  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1068  ret <2 x float>%res
1069}
1070
1071declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1072declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1073
1074define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1075;
1076; KNL_64-LABEL: test23:
1077; KNL_64:       # BB#0:
1078; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
1079; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1080; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
1081; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
1082; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1083; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
1084; KNL_64-NEXT:    retq
1085;
1086; KNL_32-LABEL: test23:
1087; KNL_32:       # BB#0:
1088; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
1089; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1090; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1091; KNL_32-NEXT:    vpsllvq .LCPI22_0, %zmm1, %zmm1
1092; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
1093; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1094; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
1095; KNL_32-NEXT:    retl
1096;
1097; SKX-LABEL: test23:
1098; SKX:       # BB#0:
1099; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
1100; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
1101; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1102; SKX-NEXT:    vmovaps %zmm2, %zmm0
1103; SKX-NEXT:    retq
1104;
1105; SKX_32-LABEL: test23:
1106; SKX_32:       # BB#0:
1107; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
1108; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
1109; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1110; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
1111; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
1112; SKX_32-NEXT:    retl
1113  %sext_ind = sext <2 x i32> %ind to <2 x i64>
1114  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1115  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1116  ret <2 x i32>%res
1117}
1118
1119define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
1120; KNL_64-LABEL: test24:
1121; KNL_64:       # BB#0:
1122; KNL_64:         movb $3, %al
1123; KNL_64-NEXT:    kmovw %eax, %k1
1124; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1125; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
1126; KNL_64-NEXT:    retq
1127;
1128; KNL_32-LABEL: test24:
1129; KNL_32:       # BB#0:
1130; KNL_32:         movl {{[0-9]+}}(%esp), %eax
1131; KNL_32-NEXT:    vpxord %zmm1, %zmm1, %zmm1
1132; KNL_32-NEXT:    vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
1133; KNL_32-NEXT:    vpsllvq .LCPI23_1, %zmm1, %zmm1
1134; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
1135; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1136; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
1137; KNL_32-NEXT:    retl
1138;
1139; SKX-LABEL: test24:
1140; SKX:       # BB#0:
1141; SKX-NEXT:    kxnorw %k0, %k0, %k1
1142; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1143; SKX-NEXT:    vmovaps %zmm1, %zmm0
1144; SKX-NEXT:    retq
1145;
1146; SKX_32-LABEL: test24:
1147; SKX_32:       # BB#0:
1148; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1149; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
1150; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
1151; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
1152; SKX_32-NEXT:    retl
1153  %sext_ind = sext <2 x i32> %ind to <2 x i64>
1154  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1155  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1156  ret <2 x i32>%res
1157}
1158
1159define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
1160;
1161; KNL_64-LABEL: test25:
1162; KNL_64:       # BB#0:
1163; KNL_64:         vpxord %zmm3, %zmm3, %zmm3
1164; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1165; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
1166; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
1167; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1168; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
1169; KNL_64-NEXT:    retq
1170;
1171; KNL_32-LABEL: test25:
1172; KNL_32:       # BB#0:
1173; KNL_32:         vpxord %zmm3, %zmm3, %zmm3
1174; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1175; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1176; KNL_32-NEXT:    vpsllvq .LCPI24_0, %zmm1, %zmm1
1177; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
1178; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1179; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
1180; KNL_32-NEXT:    retl
1181;
1182; SKX-LABEL: test25:
1183; SKX:       # BB#0:
1184; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
1185; SKX-NEXT:    vptestmq %xmm1, %xmm1, %k1
1186; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1187; SKX-NEXT:    vmovaps %zmm2, %zmm0
1188; SKX-NEXT:    retq
1189;
1190; SKX_32-LABEL: test25:
1191; SKX_32:       # BB#0:
1192; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
1193; SKX_32-NEXT:    vptestmq %xmm1, %xmm1, %k1
1194; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1195; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
1196; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
1197; SKX_32-NEXT:    retl
1198  %sext_ind = sext <2 x i32> %ind to <2 x i64>
1199  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1200  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
1201  ret <2 x i64>%res
1202}
1203
1204define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
1205;
1206; KNL_64-LABEL: test26:
1207; KNL_64:       # BB#0:
1208; KNL_64:         movb $3, %al
1209; KNL_64-NEXT:    kmovw %eax, %k1
1210; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1211; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
1212; KNL_64-NEXT:    retq
1213;
1214; KNL_32-LABEL: test26:
1215; KNL_32:       # BB#0:
1216; KNL_32:         movl {{[0-9]+}}(%esp), %eax
1217; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
1218; KNL_32-NEXT:    vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
1219; KNL_32-NEXT:    vpsllvq .LCPI25_1, %zmm2, %zmm2
1220; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
1221; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1222; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
1223; KNL_32-NEXT:    retl
1224;
1225; SKX-LABEL: test26:
1226; SKX:       # BB#0:
1227; SKX-NEXT:    kxnorw %k0, %k0, %k1
1228; SKX-NEXT:    vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1229; SKX-NEXT:    vmovaps %zmm1, %zmm0
1230; SKX-NEXT:    retq
1231;
1232; SKX_32-LABEL: test26:
1233; SKX_32:       # BB#0:
1234; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1235; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
1236; SKX_32-NEXT:    vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
1237; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
1238; SKX_32-NEXT:    retl
1239  %sext_ind = sext <2 x i32> %ind to <2 x i64>
1240  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1241  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
1242  ret <2 x i64>%res
1243}
1244
1245; Result type requires widening; all-ones mask
1246define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1247;
1248; KNL_64-LABEL: test27:
1249; KNL_64:       # BB#0:
1250; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1251; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
1252; KNL_64-NEXT:    movb $3, %al
1253; KNL_64-NEXT:    kmovw %eax, %k1
1254; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
1255; KNL_64-NEXT:    # kill
1256; KNL_64-NEXT:    retq
1257;
1258; KNL_32-LABEL: test27:
1259; KNL_32:       # BB#0:
1260; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1261; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1262; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
1263; KNL_32-NEXT:    movb $3, %cl
1264; KNL_32-NEXT:    kmovw %ecx, %k1
1265; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
1266; KNL_32-NEXT:    # kill
1267; KNL_32-NEXT:    retl
1268;
1269; SKX-LABEL: test27:
1270; SKX:       # BB#0:
1271; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
1272; SKX-NEXT:    movb $3, %al
1273; SKX-NEXT:    kmovb %eax, %k1
1274; SKX-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
1275; SKX-NEXT:    retq
1276  %sext_ind = sext <2 x i32> %ind to <2 x i64>
1277  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1278  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
1279  ret <2 x float>%res
1280}
1281
1282; Data type requires promotion, mask is all-ones
1283define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1284;
1285;
1286; KNL_64-LABEL: test28:
1287; KNL_64:       # BB#0:
1288; KNL_64:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1289; KNL_64-NEXT:    movb $3, %al
1290; KNL_64-NEXT:    kmovw %eax, %k1
1291; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
1292; KNL_64-NEXT:    retq
1293;
1294; KNL_32-LABEL: test28:
1295; KNL_32:       # BB#0:
1296; KNL_32:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1297; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
1298; KNL_32-NEXT:    vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
1299; KNL_32-NEXT:    vpsllvq .LCPI27_1, %zmm2, %zmm2
1300; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
1301; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
1302; KNL_32-NEXT:    retl
1303;
1304; SKX-LABEL: test28:
1305; SKX:       # BB#0:
1306; SKX:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1307; SKX-NEXT:    movb $3, %al
1308; SKX-NEXT:    kmovb %eax, %k1
1309; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
1310; SKX-NEXT:    retq
1311;
1312; SKX_32-LABEL: test28:
1313; SKX_32:       # BB#0:
1314; SKX_32:         vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1315; SKX_32-NEXT:    movb $3, %al
1316; SKX_32-NEXT:    kmovb %eax, %k1
1317; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
1318; SKX_32-NEXT:    retl
1319  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
1320  ret void
1321}
1322
1323
1324; SCALAR-LABEL: test29
1325; SCALAR:      extractelement <16 x float*>
1326; SCALAR-NEXT: load float
1327; SCALAR-NEXT: insertelement <16 x float>
1328; SCALAR-NEXT: extractelement <16 x float*>
1329; SCALAR-NEXT: load float
1330
1331define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1332; KNL_64-LABEL: test29:
1333; KNL_64:       # BB#0:
1334; KNL_64-NEXT:    movw $44, %ax
1335; KNL_64-NEXT:    kmovw %eax, %k1
1336; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1337; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
1338; KNL_64-NEXT:    retq
1339;
1340; KNL_32-LABEL: test29:
1341; KNL_32:       # BB#0:
1342; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1343; KNL_32-NEXT:    movw $44, %cx
1344; KNL_32-NEXT:    kmovw %ecx, %k1
1345; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1346; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
1347; KNL_32-NEXT:    retl
1348;
1349; SKX-LABEL: test29:
1350; SKX:       # BB#0:
1351; SKX-NEXT:    movw $44, %ax
1352; SKX-NEXT:    kmovw %eax, %k1
1353; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1354; SKX-NEXT:    vmovaps %zmm1, %zmm0
1355; SKX-NEXT:    retq
1356
1357  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
1358  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
1359
1360  %sext_ind = sext <16 x i32> %ind to <16 x i64>
1361  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
1362
1363  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
1364  ret <16 x float>%res
1365}
1366
1367; Check non-power-of-2 case. It should be scalarized.
1368declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
1369define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1370; KNL_64-LABEL: test30:
1371; KNL_64:       # BB#0:
1372; KNL_64-NEXT:    andl $1, %edx
1373; KNL_64-NEXT:    andl $1, %esi
1374; KNL_64-NEXT:    movl %edi, %eax
1375; KNL_64-NEXT:    andl $1, %eax
1376; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
1377; KNL_64-NEXT:    vpsllq $2, %ymm1, %ymm1
1378; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
1379; KNL_64-NEXT:    # implicit-def: %XMM0
1380; KNL_64-NEXT:    testb $1, %dil
1381; KNL_64-NEXT:    je .LBB29_2
1382; KNL_64-NEXT:  # BB#1: # %cond.load
1383; KNL_64-NEXT:    vmovq %xmm1, %rcx
1384; KNL_64-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1385; KNL_64-NEXT:  .LBB29_2: # %else
1386; KNL_64-NEXT:    testb %sil, %sil
1387; KNL_64-NEXT:    je .LBB29_4
1388; KNL_64-NEXT:  # BB#3: # %cond.load1
1389; KNL_64-NEXT:    vpextrq $1, %xmm1, %rcx
1390; KNL_64-NEXT:    vpinsrd $1, (%rcx), %xmm0, %xmm0
1391; KNL_64-NEXT:  .LBB29_4: # %else2
1392; KNL_64-NEXT:    testb %dl, %dl
1393; KNL_64-NEXT:    je .LBB29_6
1394; KNL_64-NEXT:  # BB#5: # %cond.load4
1395; KNL_64-NEXT:    vextracti128 $1, %ymm1, %xmm1
1396; KNL_64-NEXT:    vmovq %xmm1, %rcx
1397; KNL_64-NEXT:    vpinsrd $2, (%rcx), %xmm0, %xmm0
1398; KNL_64-NEXT:  .LBB29_6: # %else5
1399; KNL_64-NEXT:    vmovd %eax, %xmm1
1400; KNL_64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
1401; KNL_64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
1402; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
1403; KNL_64-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
1404; KNL_64-NEXT:    retq
1405;
1406; KNL_32-LABEL: test30:
1407; KNL_32:       # BB#0:
1408; KNL_32-NEXT:    pushl %ebx
1409; KNL_32-NEXT:  .Ltmp0:
1410; KNL_32-NEXT:    .cfi_def_cfa_offset 8
1411; KNL_32-NEXT:    pushl %esi
1412; KNL_32-NEXT:  .Ltmp1:
1413; KNL_32-NEXT:    .cfi_def_cfa_offset 12
1414; KNL_32-NEXT:  .Ltmp2:
1415; KNL_32-NEXT:    .cfi_offset %esi, -12
1416; KNL_32-NEXT:  .Ltmp3:
1417; KNL_32-NEXT:    .cfi_offset %ebx, -8
1418; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1419; KNL_32-NEXT:    andl $1, %eax
1420; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
1421; KNL_32-NEXT:    andl $1, %ecx
1422; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
1423; KNL_32-NEXT:    movl %ebx, %edx
1424; KNL_32-NEXT:    andl $1, %edx
1425; KNL_32-NEXT:    vpslld $2, %xmm1, %xmm1
1426; KNL_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
1427; KNL_32-NEXT:    # implicit-def: %XMM0
1428; KNL_32-NEXT:    testb $1, %bl
1429; KNL_32-NEXT:    je .LBB29_2
1430; KNL_32-NEXT:  # BB#1: # %cond.load
1431; KNL_32-NEXT:    vmovd %xmm1, %esi
1432; KNL_32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1433; KNL_32-NEXT:  .LBB29_2: # %else
1434; KNL_32-NEXT:    testb %cl, %cl
1435; KNL_32-NEXT:    je .LBB29_4
1436; KNL_32-NEXT:  # BB#3: # %cond.load1
1437; KNL_32-NEXT:    vpextrd $1, %xmm1, %esi
1438; KNL_32-NEXT:    vpinsrd $1, (%esi), %xmm0, %xmm0
1439; KNL_32-NEXT:  .LBB29_4: # %else2
1440; KNL_32-NEXT:    testb %al, %al
1441; KNL_32-NEXT:    je .LBB29_6
1442; KNL_32-NEXT:  # BB#5: # %cond.load4
1443; KNL_32-NEXT:    vpextrd $2, %xmm1, %esi
1444; KNL_32-NEXT:    vpinsrd $2, (%esi), %xmm0, %xmm0
1445; KNL_32-NEXT:  .LBB29_6: # %else5
1446; KNL_32-NEXT:    vmovd %edx, %xmm1
1447; KNL_32-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
1448; KNL_32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
1449; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
1450; KNL_32-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
1451; KNL_32-NEXT:    popl %esi
1452; KNL_32-NEXT:    popl %ebx
1453; KNL_32-NEXT:    retl
1454;
1455; SKX-LABEL: test30:
1456; SKX:       # BB#0:
1457; SKX-NEXT:    vpslld $31, %xmm2, %xmm2
1458; SKX-NEXT:    vptestmd %xmm2, %xmm2, %k1
1459; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
1460; SKX-NEXT:    vpmovsxdq %xmm1, %ymm1
1461; SKX-NEXT:    vpsllq $2, %ymm1, %ymm1
1462; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
1463; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
1464; SKX-NEXT:    # implicit-def: %XMM0
1465; SKX-NEXT:    testb %al, %al
1466; SKX-NEXT:    je .LBB29_2
1467; SKX-NEXT:  # BB#1: # %cond.load
1468; SKX-NEXT:    vmovq %xmm1, %rax
1469; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1470; SKX-NEXT:  .LBB29_2: # %else
1471; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
1472; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
1473; SKX-NEXT:    testb %al, %al
1474; SKX-NEXT:    je .LBB29_4
1475; SKX-NEXT:  # BB#3: # %cond.load1
1476; SKX-NEXT:    vpextrq $1, %xmm1, %rax
1477; SKX-NEXT:    vpinsrd $1, (%rax), %xmm0, %xmm0
1478; SKX-NEXT:  .LBB29_4: # %else2
1479; SKX-NEXT:    kmovb %k1, -{{[0-9]+}}(%rsp)
1480; SKX-NEXT:    movb -{{[0-9]+}}(%rsp), %al
1481; SKX-NEXT:    testb %al, %al
1482; SKX-NEXT:    je .LBB29_6
1483; SKX-NEXT:  # BB#5: # %cond.load4
1484; SKX-NEXT:    vextracti64x2 $1, %ymm1, %xmm1
1485; SKX-NEXT:    vmovq %xmm1, %rax
1486; SKX-NEXT:    vpinsrd $2, (%rax), %xmm0, %xmm0
1487; SKX-NEXT:  .LBB29_6: # %else5
1488; SKX-NEXT:    vpblendmd %xmm0, %xmm3, %xmm0 {%k1}
1489; SKX-NEXT:    retq
1490;
1491; SKX_32-LABEL: test30:
1492; SKX_32:       # BB#0:
1493; SKX_32-NEXT:    subl $12, %esp
1494; SKX_32-NEXT:  .Ltmp0:
1495; SKX_32-NEXT:    .cfi_def_cfa_offset 16
1496; SKX_32-NEXT:    vpslld $31, %xmm2, %xmm2
1497; SKX_32-NEXT:    vptestmd %xmm2, %xmm2, %k1
1498; SKX_32-NEXT:    kmovb %k1, {{[0-9]+}}(%esp)
1499; SKX_32-NEXT:    vpslld $2, %xmm1, %xmm1
1500; SKX_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
1501; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1502; SKX_32-NEXT:    # implicit-def: %XMM0
1503; SKX_32-NEXT:    testb %al, %al
1504; SKX_32-NEXT:    je .LBB29_2
1505; SKX_32-NEXT:  # BB#1: # %cond.load
1506; SKX_32-NEXT:    vmovd %xmm1, %eax
1507; SKX_32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1508; SKX_32-NEXT:  .LBB29_2: # %else
1509; SKX_32-NEXT:    kmovb %k1, {{[0-9]+}}(%esp)
1510; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1511; SKX_32-NEXT:    testb %al, %al
1512; SKX_32-NEXT:    je .LBB29_4
1513; SKX_32-NEXT:  # BB#3: # %cond.load1
1514; SKX_32-NEXT:    vpextrd $1, %xmm1, %eax
1515; SKX_32-NEXT:    vpinsrd $1, (%eax), %xmm0, %xmm0
1516; SKX_32-NEXT:  .LBB29_4: # %else2
1517; SKX_32-NEXT:    vmovdqa32 {{[0-9]+}}(%esp), %xmm2
1518; SKX_32-NEXT:    kmovb %k1, (%esp)
1519; SKX_32-NEXT:    movb (%esp), %al
1520; SKX_32-NEXT:    testb %al, %al
1521; SKX_32-NEXT:    je .LBB29_6
1522; SKX_32-NEXT:  # BB#5: # %cond.load4
1523; SKX_32-NEXT:    vpextrd $2, %xmm1, %eax
1524; SKX_32-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
1525; SKX_32-NEXT:  .LBB29_6: # %else5
1526; SKX_32-NEXT:    vpblendmd %xmm0, %xmm2, %xmm0 {%k1}
1527; SKX_32-NEXT:    addl $12, %esp
1528; SKX_32-NEXT:    retl
1529
1530  %sext_ind = sext <3 x i32> %ind to <3 x i64>
1531  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1532  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
1533  ret <3 x i32>%res
1534}
1535
1536declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
1537
; NOTE(review): the bare "KNL" prefix below is not enabled by any RUN line
; (only KNL_64 and KNL_32 are checked), so FileCheck never verifies these
; three lines; they are retained here only as legacy manual checks.
; KNL-LABEL: test31
; KNL: vpgatherqq
; KNL: vpgatherqq
define <16 x float*> @test31(<16 x float**> %ptrs) {
; KNL_64-LABEL: test31:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
; KNL_64-NEXT:    kshiftrw $8, %k1, %k1
; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
; KNL_64-NEXT:    vmovaps %zmm2, %zmm0
; KNL_64-NEXT:    vmovaps %zmm3, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test31:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test31:
; SKX:       # BB#0:
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    kxnorw %k0, %k0, %k2
; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
; SKX-NEXT:    vmovaps %zmm2, %zmm0
; SKX-NEXT:    vmovaps %zmm3, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test31:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
; SKX_32-NEXT:    retl

; Gather 16 pointer-sized elements through an all-true constant mask with an
; undef passthru (the mask registers are produced with kxnorw, i.e. all ones).
; On 64-bit targets the 16 pointer addresses span two zmm registers, so the
; gather is split into two vpgatherqq operations; on 32-bit targets pointers
; are 32 bits wide and a single vpgatherdd covers all 16 lanes.
  %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
  ret <16 x float*>%res
}
1581
define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
; KNL_64-LABEL: test_gather_16i32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm2
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16i32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    retl
; Variable-mask i32 gather.  The <16 x i1> mask arrives packed in an xmm
; register and is materialized into k1 via sign-extend / vpslld $31 /
; vptestmd.  On 64-bit targets the qword addresses occupy two zmm registers,
; so the gather is split into two vpgatherqd halves (passthru %src0 is split
; accordingly and the mask's high half derived with kshiftrw $8); on 32-bit
; targets a single vpgatherdd suffices.
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
  ret <16 x i32> %res
}
define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; KNL_64-LABEL: test_gather_16i64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16i64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp4:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp5:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp6:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16i64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    vmovaps %zmm4, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16i64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp1:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp2:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp3:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
; Variable-mask i64 gather.  On 64-bit targets both the addresses and the
; result span two zmm registers, so two vpgatherqq ops are emitted with the
; mask's upper half obtained via kshiftrw $8.  On 32-bit targets the part of
; %src0 that did not fit in registers is loaded from the caller's frame at
; 8(%ebp) after the stack is realigned to 64 bytes (andl $-64, %esp) for the
; aligned vmovdqa64 load, and dword-index vpgatherdq is used instead.
  %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
1704declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
; KNL_64-LABEL: test_gather_16f32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16f32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16f32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm2
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16f32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    retl
; Float analogue of test_gather_16i32: same mask materialization
; (vpmovsxbd / vpslld / vptestmd), but FP gather instructions are selected
; (vgatherqps split in two halves on 64-bit; a single vgatherdps on 32-bit).
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  ret <16 x float> %res
}
define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; KNL_64-LABEL: test_gather_16f64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT:    vmovaps %zmm3, %zmm0
; KNL_64-NEXT:    vmovaps %zmm4, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_16f64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp7:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp8:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp9:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_16f64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
; SKX-NEXT:    vmovaps %zmm3, %zmm0
; SKX-NEXT:    vmovaps %zmm4, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_16f64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp4:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp5:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp6:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
; Double analogue of test_gather_16i64: two vgatherqpd halves on 64-bit
; targets; on 32-bit targets the overflow portion of %src0 is loaded from
; 8(%ebp) (frame realigned to 64 bytes for the aligned vmovapd) and
; dword-index vgatherdpd is used for each half of the address vector.
  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
1827declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
; KNL_64-LABEL: test_scatter_16i32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16i32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16i32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT:    retl
; Variable-mask i32 scatter, mirroring test_gather_16i32: on 64-bit targets
; the data vector is split (vextract of the high ymm half) and stored with
; two vpscatterqd ops under k1/k2; on 32-bit targets a single vpscatterdd
; writes all 16 lanes.
  call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
; KNL_64-LABEL: test_scatter_16i64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp10:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp11:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp12:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16i64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16i64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp7:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp8:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp9:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
; Variable-mask i64 scatter, mirroring test_gather_16i64: two vpscatterqq
; ops under k1/k2 on 64-bit targets; on 32-bit targets the overflow portion
; of %src0 is loaded from 8(%ebp) (stack realigned to 64 bytes for the
; aligned vmovdqa64) and stored with dword-index vpscatterdq per half.
  call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
1940declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
; KNL_64-LABEL: test_scatter_16f32:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16f32:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16f32:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16f32:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT:    retl
; Float analogue of test_scatter_16i32: FP scatter instructions are selected
; (two vscatterqps halves on 64-bit; a single vscatterdps on 32-bit).
  call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
1982declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
; KNL_64-LABEL: test_scatter_16f64:
; KNL_64:       # BB#0:
; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32:       # BB#0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:  .Ltmp13:
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:  .Ltmp14:
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:  .Ltmp15:
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
; KNL_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_16f64:
; SKX:       # BB#0:
; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_16f64:
; SKX_32:       # BB#0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:  .Ltmp10:
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:  .Ltmp11:
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:  .Ltmp12:
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
; SKX_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    retl
; Double analogue of test_scatter_16i64: two vscatterqpd ops under k1/k2 on
; 64-bit targets; on 32-bit targets the overflow portion of %src0 comes from
; 8(%ebp) (frame realigned to 64 bytes for the aligned vmovapd) and is
; stored with dword-index vscatterdpd per half of the address vector.
  call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
2054declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
2055