; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
; RUN: llc -mtriple=x86_64-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
; RUN: llc -mtriple=i386-unknown-linux-gnu  -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: opt -mtriple=x86_64-apple-darwin -passes=scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null

; Constant array (Fibonacci values) available for gather-from-global tests.
@glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16

; SCALAR-LABEL: test1
; SCALAR:      extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

; Gather 16 floats through a splatted base pointer plus sign-extended i32
; indices; the mask is all ones.
define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test1:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test1:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test1:
; SKX:       # %bb.0:
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test1:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )


; SCALAR-LABEL: test2
; SCALAR:      extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT:  %res.phi.else = phi
; SCALAR-NEXT:  and i16 %{{.*}}, 2
; SCALAR-NEXT:  icmp ne i16 %{{.*}}, 0
; SCALAR-NEXT:  br i1 %{{.*}}, label %cond.load1, label %else2

; Same gather as test1 but with a variable i16 mask.
define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test2:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test2:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test2:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
  ret <16 x float> %res
}

; Same as test2 but gathering i32 elements instead of floats.
define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test3:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovdqa64 %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test3:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test3:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test3:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  ret <16 x i32> %res
}


; Two gathers from the same addresses; the second uses the first result as
; pass-through, and the two results are added.
define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test4:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    kmovw %k1, %k2
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; KNL_64-NEXT:    vmovdqa64 %zmm1, %zmm2
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test4:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm2
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test4:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    kmovw %k1, %k2
; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; SKX-NEXT:    vmovdqa64 %zmm1, %zmm2
; SKX-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; SKX-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test4:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    kmovw %k1, %k2
; SKX_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm2
; SKX_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; SKX_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}


; SCALAR-LABEL: test5
; SCALAR:        and i16 %scalar_mask, 1
; SCALAR-NEXT:   icmp ne i16 %{{.*}}, 0
; SCALAR-NEXT:   br i1 %{{.*}}, label %cond.store, label %else
; SCALAR: cond.store:
; SCALAR-NEXT:  %Elt0 = extractelement <16 x i32> %val, i64 0
; SCALAR-NEXT:  %Ptr0 = extractelement <16 x i32*> %gep.random, i64 0
; SCALAR-NEXT:  store i32 %Elt0, i32* %Ptr0, align 4
; SCALAR-NEXT:  br label %else
; SCALAR: else:
; SCALAR-NEXT:   and i16 %scalar_mask, 2
; SCALAR-NEXT:   icmp ne i16 %{{.*}}, 0
; SCALAR-NEXT:  br i1 %{{.*}}, label %cond.store1, label %else2

; Two identical masked scatters to the same addresses.
define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-LABEL: test5:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    kmovw %k1, %k2
; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test5:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test5:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    kmovw %k1, %k2
; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test5:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    kmovw %k1, %k2
; SKX_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; SKX_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; SKX_32-NEXT:    vzeroupper
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  ret void
}

declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )


; SCALAR-LABEL: test6
; SCALAR:        store i32 %Elt0, i32* %Ptr01, align 4
; SCALAR-NEXT:   %Elt1 = extractelement <8 x i32> %a1, i64 1
; SCALAR-NEXT:   %Ptr12 = extractelement <8 x i32*> %ptr, i64 1
; SCALAR-NEXT:   store i32 %Elt1, i32* %Ptr12, align 4
; SCALAR-NEXT:   %Elt2 = extractelement <8 x i32> %a1, i64 2
; SCALAR-NEXT:   %Ptr23 = extractelement <8 x i32*> %ptr, i64 2
; SCALAR-NEXT:   store i32 %Elt2, i32* %Ptr23, align 4

; Gather then scatter through the same pointer vector, both with all-ones masks.
define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-LABEL: test6:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    vmovdqa %ymm2, %ymm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test6:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL_32-NEXT:    movw $255, %ax
; KNL_32-NEXT:    kmovw %eax, %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherdd (,%zmm1), %zmm2 {%k2}
; KNL_32-NEXT:    vpscatterdd %zmm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    vmovdqa %ymm2, %ymm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test6:
; SKX:       # %bb.0:
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    kxnorw %k0, %k0, %k2
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; SKX-NEXT:    vmovdqa %ymm2, %ymm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test6:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    kxnorw %k0, %k0, %k2
; SKX_32-NEXT:    vpgatherdd (,%ymm1), %ymm2 {%k2}
; SKX_32-NEXT:    vpscatterdd %ymm0, (,%ymm1) {%k1}
; SKX_32-NEXT:    vmovdqa %ymm2, %ymm0
; SKX_32-NEXT:    retl

  %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)

  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i32>%a
}

; 8-element variant of test4: two gathers with an i8 mask, results added.
define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
;
; KNL_64-LABEL: test7:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL_64-NEXT:    kmovw %esi, %k0
; KNL_64-NEXT:    kshiftlw $8, %k0, %k0
; KNL_64-NEXT:    kshiftrw $8, %k0, %k1
; KNL_64-NEXT:    kmovw %k1, %k2
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; KNL_64-NEXT:    vmovdqa64 %zmm1, %zmm2
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test7:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT:    kmovw %ecx, %k0
; KNL_32-NEXT:    kshiftlw $8, %k0, %k0
; KNL_32-NEXT:    kshiftrw $8, %k0, %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm2
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test7:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    kmovw %k1, %k2
; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
; SKX-NEXT:    vmovdqa %ymm1, %ymm2
; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
; SKX-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test7:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    kmovw %k1, %k2
; SKX_32-NEXT:    vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
; SKX_32-NEXT:    vmovdqa %ymm1, %ymm2
; SKX_32-NEXT:    vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
; SKX_32-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer

  %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
  %imask = bitcast i8 %mask to <8 x i1>
  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
  %res = add <8 x i32> %gt1, %gt2
  ret <8 x i32> %res
}

; No uniform base in this case, index <8 x i64> contains addresses,
; each gather call will be split into two
define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test8:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kmovw %edi, %k1
; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
; KNL_64-NEXT:    kmovw %k2, %k3
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
; KNL_64-NEXT:    kmovw %k1, %k3
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm4
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test8:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    kmovw %k1, %k2
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm2
; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test8:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovw %edi, %k1
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    kmovw %k2, %k3
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
; SKX-NEXT:    kmovw %k1, %k3
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
; SKX-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm4
; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test8:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    kmovw %k1, %k2
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k2}
; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm2
; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}

%struct.RT = type { i8, [10 x [20 x i32]], i8 }
%struct.ST = type { i32, double, %struct.RT }

; Masked gather for aggregate types
; Test9 and Test10 should give the same result (scalar and vector indices in GEP)


; Struct GEP where every index operand is a (splatted) vector.
define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-LABEL: test9:
; KNL_64:       # %bb.0: # %entry
; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; KNL_64-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test9:
; KNL_32:       # %bb.0: # %entry
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm1
; KNL_32-NEXT:    movw $255, %ax
; KNL_32-NEXT:    kmovw %eax, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL_32-NEXT:    retl
;
; SKX_SMALL-LABEL: test9:
; SKX_SMALL:       # %bb.0: # %entry
; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_SMALL-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; SKX_SMALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
; SKX_SMALL-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX_SMALL-NEXT:    retq
;
; SKX_LARGE-LABEL: test9:
; SKX_LARGE:       # %bb.0: # %entry
; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
; SKX_LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
; SKX_LARGE-NEXT:    vpaddq (%rax){1to8}, %zmm0, %zmm1
; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
; SKX_LARGE-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX_LARGE-NEXT:    retq
;
; SKX_32-LABEL: test9:
; SKX_32:       # %bb.0: # %entry
; SKX_32-NEXT:    vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
; SKX_32-NEXT:    vpmovqd %zmm0, %ymm0
; SKX_32-NEXT:    vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
; SKX_32-NEXT:    vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; SKX_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; SKX_32-NEXT:    vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vpgatherdd (,%ymm1), %ymm0 {%k1}
; SKX_32-NEXT:    retl
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
  %res = call <8 x i32 >  @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

; Same struct GEP as test9 but with scalar constant indices mixed in.
define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-LABEL: test10:
; KNL_64:       # %bb.0: # %entry
; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT:    vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT:    vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; KNL_64-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test10:
; KNL_32:       # %bb.0: # %entry
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
; KNL_32-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm1
; KNL_32-NEXT:    movw $255, %ax
; KNL_32-NEXT:    kmovw %eax, %k1
; KNL_32-NEXT:    vpgatherdd (,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL_32-NEXT:    retl
;
; SKX_SMALL-LABEL: test10:
; SKX_SMALL:       # %bb.0: # %entry
; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm2
; SKX_SMALL-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_SMALL-NEXT:    vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; SKX_SMALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
; SKX_SMALL-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX_SMALL-NEXT:    retq
;
; SKX_LARGE-LABEL: test10:
; SKX_LARGE:       # %bb.0: # %entry
; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm2
; SKX_LARGE-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
; SKX_LARGE-NEXT:    vpmuldq (%rax){1to8}, %zmm1, %zmm1
; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
; SKX_LARGE-NEXT:    vpaddq (%rax){1to8}, %zmm0, %zmm1
; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
; SKX_LARGE-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX_LARGE-NEXT:    retq
;
; SKX_32-LABEL: test10:
; SKX_32:       # %bb.0: # %entry
; SKX_32-NEXT:    vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
; SKX_32-NEXT:    vpmovqd %zmm0, %ymm0
; SKX_32-NEXT:    vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
; SKX_32-NEXT:    vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; SKX_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; SKX_32-NEXT:    vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vpgatherdd (,%ymm1), %ymm0 {%k1}
; SKX_32-NEXT:    retl
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
  %res = call <8 x i32 >  @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

; Splat index in GEP, requires broadcast
define <16 x float> @test11(float* %base, i32 %ind) {
; KNL_64-LABEL: test11:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    movslq %esi, %rax
; KNL_64-NEXT:    leaq (%rdi,%rax,4), %rax
; KNL_64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test11:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    shll $2, %eax
; KNL_32-NEXT:    addl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test11:
; SKX:       # %bb.0:
; SKX-NEXT:    movslq %esi, %rax
; SKX-NEXT:    leaq (%rdi,%rax,4), %rax
; SKX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test11:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    shll $2, %eax
; SKX_32-NEXT:    addl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

; We are checking the uniform base here. It is taken directly from input to vgatherdps
define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test12:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test12:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test12:
; SKX:       # %bb.0:
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test12:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

; The same as the previous, but the mask is undefined
; NOTE(review): the IR below actually passes an all-true mask, identical to
; test12 — the header above may be stale; verify against the test's history.
define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test13:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test13:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test13:
; SKX:       # %bb.0:
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vmovaps %zmm1, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test13:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
; SKX_32-NEXT:    retl

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}
764
; The base pointer is not splat, can't find uniform base
; %base is inserted into lane 1 but the shuffle splats lane 0, so the splat
; value comes from %vec (the checks read it with vmovq/vmovd from %xmm0).
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-LABEL: test14:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vmovq %xmm0, %rax
; KNL_64-NEXT:    vmovd %esi, %xmm0
; KNL_64-NEXT:    vpbroadcastd %xmm0, %ymm0
; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT:    vpsllq $2, %zmm0, %zmm0
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vgatherqps (%rax,%zmm0), %ymm1 {%k1}
; KNL_64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test14:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    vmovd %xmm0, %eax
; KNL_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test14:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovq %xmm0, %rax
; SKX-NEXT:    vpbroadcastd %esi, %ymm0
; SKX-NEXT:    vpmovsxdq %ymm0, %zmm0
; SKX-NEXT:    vpsllq $2, %zmm0, %zmm0
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vgatherqps (%rax,%zmm0), %ymm1 {%k1}
; SKX-NEXT:    vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test14:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vmovd %xmm0, %eax
; SKX_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vgatherdps (%eax,%zmm1), %zmm0 {%k1}
; SKX_32-NEXT:    retl

  %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}
814
815declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
816declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
817declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
818
; Gather smaller than existing instruction
; v4f32 gather: KNL has no 128-bit gather, so the mask is widened (kshift pair
; clears the upper lanes) and a zmm gather is used; SKX uses the xmm form.
define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
; KNL_64-LABEL: test15:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k0
; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %xmm1, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test15:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k0
; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovaps %xmm1, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test15:
; SKX:       # %bb.0:
; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
; SKX-NEXT:    vpmovd2m %xmm1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT:    vmovaps %xmm1, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test15:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT:    vpmovd2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT:    vmovaps %xmm1, %xmm0
; SKX_32-NEXT:    retl

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
  ret <4 x float>%res
}
868
; Gather smaller than existing instruction
; v4f64 gather with passthru %src0: the passthru register is the gather's
; merge destination, so masked-off lanes keep their %src0 values.
define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
; KNL_64-LABEL: test16:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k0
; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
; KNL_64-NEXT:    vmovapd %ymm2, %ymm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test16:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k0
; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
; KNL_32-NEXT:    vmovapd %ymm2, %ymm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test16:
; SKX:       # %bb.0:
; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
; SKX-NEXT:    vpmovd2m %xmm1, %k1
; SKX-NEXT:    vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
; SKX-NEXT:    vmovapd %ymm2, %ymm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test16:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT:    vpmovd2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
; SKX_32-NEXT:    vmovapd %ymm2, %ymm0
; SKX_32-NEXT:    retl

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
  ret <4 x double>%res
}
918
; v2f64 gather with mask and passthru: only two mask bits are live, kept by
; the kshiftlw/kshiftrw $14 pair on KNL.
define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
; KNL_64-LABEL: test17:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
; KNL_64-NEXT:    vmovapd %xmm2, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test17:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
; KNL_32-NEXT:    vmovapd %xmm2, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test17:
; SKX:       # %bb.0:
; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT:    vmovapd %xmm2, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test17:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT:    vpmovq2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vgatherdpd (%eax,%xmm0,8), %xmm2 {%k1}
; SKX_32-NEXT:    vmovapd %xmm2, %xmm0
; SKX_32-NEXT:    retl

  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
  ret <2 x double>%res
}
969
970declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
971declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
972declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
973declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
974declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
975
; v4i32 scatter through a full vector of pointers (no uniform base: the
; scatter's memory operand has no base register, only the pointer vector).
define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
; KNL_64-LABEL: test18:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT:    vpslld $31, %xmm2, %xmm2
; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k0
; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test18:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT:    vpslld $31, %xmm2, %xmm2
; KNL_32-NEXT:    vptestmd %zmm2, %zmm2, %k0
; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
; KNL_32-NEXT:    vpscatterdd %zmm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test18:
; SKX:       # %bb.0:
; SKX-NEXT:    vpslld $31, %xmm2, %xmm2
; SKX-NEXT:    vpmovd2m %xmm2, %k1
; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test18:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpslld $31, %xmm2, %xmm2
; SKX_32-NEXT:    vpmovd2m %xmm2, %k1
; SKX_32-NEXT:    vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
  ret void
}
1018
; v4f64 scatter: the GEP's uniform base pointer folds into the scatter's base
; register, with the v4i64 indices supplied in a vector register.
define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; KNL_64-LABEL: test19:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
; KNL_64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k0
; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
; KNL_64-NEXT:    vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test19:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k0
; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test19:
; SKX:       # %bb.0:
; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
; SKX-NEXT:    vpmovd2m %xmm1, %k1
; SKX-NEXT:    vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test19:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT:    vpmovd2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
; SKX_32-NEXT:    vzeroupper
; SKX_32-NEXT:    retl
  %gep = getelementptr double, double* %ptr, <4 x i64> %ind
  call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
  ret void
}
1065
; Data type requires widening
; v2f32 scatter through a pointer vector; the 2-element data is widened to the
; narrowest supported scatter element count.
define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; KNL_64-LABEL: test20:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT:    vpsllq $63, %xmm2, %xmm2
; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k0
; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test20:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT:    vpsllq $63, %xmm2, %xmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k0
; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
; KNL_32-NEXT:    vscatterdps %zmm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test20:
; SKX:       # %bb.0:
; SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
; SKX-NEXT:    vpmovq2m %xmm2, %k1
; SKX-NEXT:    vscatterqps %xmm0, (,%xmm1) {%k1}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test20:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm2, %xmm2
; SKX_32-NEXT:    vpmovq2m %xmm2, %k1
; SKX_32-NEXT:    vscatterdps %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
  ret void
}
1108
; Data type requires promotion
; v2i32 scatter through a pointer vector; mirrors test20 with integer data.
define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; KNL_64-LABEL: test21:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT:    vpsllq $63, %xmm2, %xmm2
; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k0
; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test21:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT:    vpsllq $63, %xmm2, %xmm2
; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k0
; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
; KNL_32-NEXT:    vpscatterdd %zmm0, (,%zmm1) {%k1}
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test21:
; SKX:       # %bb.0:
; SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
; SKX-NEXT:    vpmovq2m %xmm2, %k1
; SKX-NEXT:    vpscatterqd %xmm0, (,%xmm1) {%k1}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test21:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm2, %xmm2
; SKX_32-NEXT:    vpmovq2m %xmm2, %k1
; SKX_32-NEXT:    vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT:    retl
  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
  ret void
}
1151
1152; The result type requires widening
1153declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
1154
; v2f32 masked gather with passthru; indices sign-extended from v2i32
; (see the widening note above the v2f32 gather declaration).
define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
; KNL_64-LABEL: test22:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT:    vmovaps %xmm2, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test22:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT:    vmovaps %xmm2, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test22:
; SKX:       # %bb.0:
; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT:    vmovaps %xmm2, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test22:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT:    vpmovq2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT:    vmovaps %xmm2, %xmm0
; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
  ret <2 x float>%res
}
1204
; Same as test22 but the indices are already v2i64, so the 64-bit-index
; vgatherqps form is selected with no sign-extension step.
define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) {
; KNL_64-LABEL: test22a:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT:    vmovaps %xmm2, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test22a:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT:    vmovaps %xmm2, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test22a:
; SKX:       # %bb.0:
; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT:    vmovaps %xmm2, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test22a:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT:    vpmovq2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vgatherqps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT:    vmovaps %xmm2, %xmm0
; SKX_32-NEXT:    retl
  %gep.random = getelementptr float, float* %base, <2 x i64> %ind
  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
  ret <2 x float>%res
}
1253
1254declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1255declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1256
; v2i32 masked gather with passthru; indices sign-extended from v2i32,
; selecting the 32-bit-index vpgatherdd form.
define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: test23:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT:    vmovdqa %xmm2, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test23:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT:    vmovdqa %xmm2, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test23:
; SKX:       # %bb.0:
; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT:    vmovdqa %xmm2, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test23:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT:    vpmovq2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vpgatherdd (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT:    vmovdqa %xmm2, %xmm0
; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32>%res
}
1306
; Same as test23 but with v2i64 indices, selecting the 64-bit-index
; vpgatherqd form.
define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: test23b:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT:    vmovdqa %xmm2, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test23b:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT:    vmovdqa %xmm2, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test23b:
; SKX:       # %bb.0:
; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vpgatherqd (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT:    vmovdqa %xmm2, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test23b:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT:    vpmovq2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vpgatherqd (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT:    vmovdqa %xmm2, %xmm0
; SKX_32-NEXT:    retl
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32>%res
}
1355
; All-true v2i1 mask: lowered to a constant mask register value of 3
; (the two live lanes) instead of a mask computed from a vector.
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test24:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_64-NEXT:    movw $3, %ax
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovdqa %xmm1, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test24:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    movw $3, %cx
; KNL_32-NEXT:    kmovw %ecx, %k1
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vmovdqa %xmm1, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test24:
; SKX:       # %bb.0:
; SKX-NEXT:    movb $3, %al
; SKX-NEXT:    kmovw %eax, %k1
; SKX-NEXT:    vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT:    vmovdqa %xmm1, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test24:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    movb $3, %cl
; SKX_32-NEXT:    kmovw %ecx, %k1
; SKX_32-NEXT:    vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT:    vmovdqa %xmm1, %xmm0
; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32>%res
}
1399
; v2i64 masked gather with passthru; indices sign-extended from v2i32,
; selecting the vpgatherdq (32-bit index, 64-bit element) form.
define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
; KNL_64-LABEL: test25:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
; KNL_64-NEXT:    vpgatherdq (%rdi,%ymm0,8), %zmm2 {%k1}
; KNL_64-NEXT:    vmovdqa %xmm2, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test25:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k0
; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpgatherdq (%eax,%ymm0,8), %zmm2 {%k1}
; KNL_32-NEXT:    vmovdqa %xmm2, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test25:
; SKX:       # %bb.0:
; SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX-NEXT:    vpmovq2m %xmm1, %k1
; SKX-NEXT:    vpgatherdq (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT:    vmovdqa %xmm2, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test25:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT:    vpmovq2m %xmm1, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vpgatherdq (%eax,%xmm0,8), %xmm2 {%k1}
; SKX_32-NEXT:    vmovdqa %xmm2, %xmm0
; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
  %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
  ret <2 x i64>%res
}
1449
; Same as test25 but with an all-true mask: SKX folds it into kxnorw,
; KNL materializes the two-lane constant 3.
define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_64-LABEL: test26:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_64-NEXT:    movb $3, %al
; KNL_64-NEXT:    kmovw %eax, %k1
; KNL_64-NEXT:    vpgatherdq (%rdi,%ymm0,8), %zmm1 {%k1}
; KNL_64-NEXT:    vmovdqa %xmm1, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test26:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    movb $3, %cl
; KNL_32-NEXT:    kmovw %ecx, %k1
; KNL_32-NEXT:    vpgatherdq (%eax,%ymm0,8), %zmm1 {%k1}
; KNL_32-NEXT:    vmovdqa %xmm1, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test26:
; SKX:       # %bb.0:
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vpgatherdq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT:    vmovdqa %xmm1, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test26:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vpgatherdq (%eax,%xmm0,8), %xmm1 {%k1}
; SKX_32-NEXT:    vmovdqa %xmm1, %xmm0
; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
  %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
  ret <2 x i64>%res
}
1493
1494; Result type requires widening; all-ones mask
; Gather of <2 x float> with a constant all-ones mask and an undef passthru;
; the narrow result type is widened to a legal vector width for the gather.
1495define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1496; KNL_64-LABEL: test27:
1497; KNL_64:       # %bb.0:
1498; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1499; KNL_64-NEXT:    movw $3, %ax
1500; KNL_64-NEXT:    kmovw %eax, %k1
1501; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1502; KNL_64-NEXT:    vmovaps %xmm1, %xmm0
1503; KNL_64-NEXT:    vzeroupper
1504; KNL_64-NEXT:    retq
1505;
1506; KNL_32-LABEL: test27:
1507; KNL_32:       # %bb.0:
1508; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1509; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1510; KNL_32-NEXT:    movw $3, %cx
1511; KNL_32-NEXT:    kmovw %ecx, %k1
1512; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1513; KNL_32-NEXT:    vmovaps %xmm1, %xmm0
1514; KNL_32-NEXT:    vzeroupper
1515; KNL_32-NEXT:    retl
1516;
1517; SKX-LABEL: test27:
1518; SKX:       # %bb.0:
1519; SKX-NEXT:    movb $3, %al
1520; SKX-NEXT:    kmovw %eax, %k1
1521; SKX-NEXT:    vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
1522; SKX-NEXT:    vmovaps %xmm1, %xmm0
1523; SKX-NEXT:    retq
1524;
1525; SKX_32-LABEL: test27:
1526; SKX_32:       # %bb.0:
1527; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1528; SKX_32-NEXT:    movb $3, %cl
1529; SKX_32-NEXT:    kmovw %ecx, %k1
1530; SKX_32-NEXT:    vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
1531; SKX_32-NEXT:    vmovaps %xmm1, %xmm0
1532; SKX_32-NEXT:    retl
1533  %sext_ind = sext <2 x i32> %ind to <2 x i64>
1534  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1535  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
1536  ret <2 x float>%res
1537}
1538
1539; Data type requires promotion, mask is all-ones
; Scatter of <2 x i32> through an explicit vector of pointers with an
; all-ones mask; the i32 element type requires promotion on KNL targets.
1540define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1541; KNL_64-LABEL: test28:
1542; KNL_64:       # %bb.0:
1543; KNL_64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1544; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1545; KNL_64-NEXT:    movb $3, %al
1546; KNL_64-NEXT:    kmovw %eax, %k1
1547; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
1548; KNL_64-NEXT:    vzeroupper
1549; KNL_64-NEXT:    retq
1550;
1551; KNL_32-LABEL: test28:
1552; KNL_32:       # %bb.0:
1553; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1554; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1555; KNL_32-NEXT:    movw $3, %ax
1556; KNL_32-NEXT:    kmovw %eax, %k1
1557; KNL_32-NEXT:    vpscatterdd %zmm0, (,%zmm1) {%k1}
1558; KNL_32-NEXT:    vzeroupper
1559; KNL_32-NEXT:    retl
1560;
1561; SKX-LABEL: test28:
1562; SKX:       # %bb.0:
1563; SKX-NEXT:    kxnorw %k0, %k0, %k1
1564; SKX-NEXT:    vpscatterqd %xmm0, (,%xmm1) {%k1}
1565; SKX-NEXT:    retq
1566;
1567; SKX_32-LABEL: test28:
1568; SKX_32:       # %bb.0:
1569; SKX_32-NEXT:    movb $3, %al
1570; SKX_32-NEXT:    kmovw %eax, %k1
1571; SKX_32-NEXT:    vpscatterdd %xmm0, (,%xmm1) {%k1}
1572; SKX_32-NEXT:    retl
1573  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
1574  ret void
1575}
1576
1577; SCALAR-LABEL: test29
1578; SCALAR:      extractelement <16 x float*>
1579; SCALAR-NEXT: load float
1580; SCALAR-NEXT: insertelement <16 x float>
1581; SCALAR-NEXT: extractelement <16 x float*>
1582; SCALAR-NEXT: load float
1583
; Gather through a splatted scalar base pointer with a constant, partially-set
; mask (elements 2, 3 and 5 -> immediate $44 = 0b101100 in the k-register).
1584define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1585; KNL_64-LABEL: test29:
1586; KNL_64:       # %bb.0:
1587; KNL_64-NEXT:    movw $44, %ax
1588; KNL_64-NEXT:    kmovw %eax, %k1
1589; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1590; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
1591; KNL_64-NEXT:    retq
1592;
1593; KNL_32-LABEL: test29:
1594; KNL_32:       # %bb.0:
1595; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1596; KNL_32-NEXT:    movw $44, %cx
1597; KNL_32-NEXT:    kmovw %ecx, %k1
1598; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1599; KNL_32-NEXT:    vmovaps %zmm1, %zmm0
1600; KNL_32-NEXT:    retl
1601;
1602; SKX-LABEL: test29:
1603; SKX:       # %bb.0:
1604; SKX-NEXT:    movw $44, %ax
1605; SKX-NEXT:    kmovw %eax, %k1
1606; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1607; SKX-NEXT:    vmovaps %zmm1, %zmm0
1608; SKX-NEXT:    retq
1609;
1610; SKX_32-LABEL: test29:
1611; SKX_32:       # %bb.0:
1612; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
1613; SKX_32-NEXT:    movw $44, %cx
1614; SKX_32-NEXT:    kmovw %ecx, %k1
1615; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1616; SKX_32-NEXT:    vmovaps %zmm1, %zmm0
1617; SKX_32-NEXT:    retl
1618
1619  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
1620  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
1621
1622  %sext_ind = sext <16 x i32> %ind to <16 x i64>
1623  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
1624
1625  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
1626  ret <16 x float>%res
1627}
1628
1629declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
; Non-power-of-2 gather: <3 x i32> with a variable <3 x i1> mask.  The mask
; bits arrive as scalar arguments and are assembled one bit at a time in
; k-registers (kand/kor/kshift) before the masked gather executes.
1630define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1631; KNL_64-LABEL: test30:
1632; KNL_64:       # %bb.0:
1633; KNL_64-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
1634; KNL_64-NEXT:    movw $-3, %ax
1635; KNL_64-NEXT:    kmovw %eax, %k0
1636; KNL_64-NEXT:    andl $1, %edi
1637; KNL_64-NEXT:    kmovw %edi, %k1
1638; KNL_64-NEXT:    kandw %k0, %k1, %k0
1639; KNL_64-NEXT:    kmovw %esi, %k1
1640; KNL_64-NEXT:    kshiftlw $15, %k1, %k1
1641; KNL_64-NEXT:    kshiftrw $14, %k1, %k1
1642; KNL_64-NEXT:    korw %k1, %k0, %k0
1643; KNL_64-NEXT:    movw $-5, %ax
1644; KNL_64-NEXT:    kmovw %eax, %k1
1645; KNL_64-NEXT:    kandw %k1, %k0, %k0
1646; KNL_64-NEXT:    kmovw %edx, %k1
1647; KNL_64-NEXT:    kshiftlw $15, %k1, %k1
1648; KNL_64-NEXT:    kshiftrw $13, %k1, %k1
1649; KNL_64-NEXT:    korw %k1, %k0, %k0
1650; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
1651; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
1652; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
1653; KNL_64-NEXT:    vpsllq $2, %ymm1, %ymm1
1654; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
1655; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm2 {%k1}
1656; KNL_64-NEXT:    vmovdqa %xmm2, %xmm0
1657; KNL_64-NEXT:    vzeroupper
1658; KNL_64-NEXT:    retq
1659;
1660; KNL_32-LABEL: test30:
1661; KNL_32:       # %bb.0:
1662; KNL_32-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
1663; KNL_32-NEXT:    movw $-3, %ax
1664; KNL_32-NEXT:    kmovw %eax, %k0
1665; KNL_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1666; KNL_32-NEXT:    andl $1, %eax
1667; KNL_32-NEXT:    kmovw %eax, %k1
1668; KNL_32-NEXT:    kandw %k0, %k1, %k0
1669; KNL_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1670; KNL_32-NEXT:    kmovw %eax, %k1
1671; KNL_32-NEXT:    kshiftlw $15, %k1, %k1
1672; KNL_32-NEXT:    kshiftrw $14, %k1, %k1
1673; KNL_32-NEXT:    korw %k1, %k0, %k0
1674; KNL_32-NEXT:    movw $-5, %ax
1675; KNL_32-NEXT:    kmovw %eax, %k1
1676; KNL_32-NEXT:    kandw %k1, %k0, %k0
1677; KNL_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1678; KNL_32-NEXT:    kmovw %eax, %k1
1679; KNL_32-NEXT:    kshiftlw $15, %k1, %k1
1680; KNL_32-NEXT:    kshiftrw $13, %k1, %k1
1681; KNL_32-NEXT:    korw %k1, %k0, %k0
1682; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
1683; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
1684; KNL_32-NEXT:    vpslld $2, %xmm1, %xmm1
1685; KNL_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1686; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
1687; KNL_32-NEXT:    vmovdqa %xmm2, %xmm0
1688; KNL_32-NEXT:    vzeroupper
1689; KNL_32-NEXT:    retl
1690;
1691; SKX-LABEL: test30:
1692; SKX:       # %bb.0:
1693; SKX-NEXT:    movb $-3, %al
1694; SKX-NEXT:    kmovw %eax, %k0
1695; SKX-NEXT:    kmovw %edi, %k1
1696; SKX-NEXT:    kshiftlb $7, %k1, %k1
1697; SKX-NEXT:    kshiftrb $7, %k1, %k1
1698; SKX-NEXT:    kandw %k0, %k1, %k0
1699; SKX-NEXT:    kmovw %esi, %k1
1700; SKX-NEXT:    kshiftlb $7, %k1, %k1
1701; SKX-NEXT:    kshiftrb $6, %k1, %k1
1702; SKX-NEXT:    korw %k1, %k0, %k0
1703; SKX-NEXT:    movb $-5, %al
1704; SKX-NEXT:    kmovw %eax, %k1
1705; SKX-NEXT:    kandw %k1, %k0, %k0
1706; SKX-NEXT:    kmovw %edx, %k1
1707; SKX-NEXT:    kshiftlb $7, %k1, %k1
1708; SKX-NEXT:    kshiftrb $5, %k1, %k1
1709; SKX-NEXT:    korw %k1, %k0, %k1
1710; SKX-NEXT:    vpmovsxdq %xmm1, %ymm1
1711; SKX-NEXT:    vpsllq $2, %ymm1, %ymm1
1712; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
1713; SKX-NEXT:    vpgatherqd (,%ymm0), %xmm2 {%k1}
1714; SKX-NEXT:    vmovdqa %xmm2, %xmm0
1715; SKX-NEXT:    vzeroupper
1716; SKX-NEXT:    retq
1717;
1718; SKX_32-LABEL: test30:
1719; SKX_32:       # %bb.0:
1720; SKX_32-NEXT:    movb $-3, %al
1721; SKX_32-NEXT:    kmovw %eax, %k0
1722; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1723; SKX_32-NEXT:    kmovw %eax, %k1
1724; SKX_32-NEXT:    kshiftlb $7, %k1, %k1
1725; SKX_32-NEXT:    kshiftrb $7, %k1, %k1
1726; SKX_32-NEXT:    kandw %k0, %k1, %k0
1727; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1728; SKX_32-NEXT:    kmovw %eax, %k1
1729; SKX_32-NEXT:    kshiftlb $7, %k1, %k1
1730; SKX_32-NEXT:    kshiftrb $6, %k1, %k1
1731; SKX_32-NEXT:    korw %k1, %k0, %k0
1732; SKX_32-NEXT:    movb $-5, %al
1733; SKX_32-NEXT:    kmovw %eax, %k1
1734; SKX_32-NEXT:    kandw %k1, %k0, %k0
1735; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1736; SKX_32-NEXT:    kmovw %eax, %k1
1737; SKX_32-NEXT:    kshiftlb $7, %k1, %k1
1738; SKX_32-NEXT:    kshiftrb $5, %k1, %k1
1739; SKX_32-NEXT:    korw %k1, %k0, %k1
1740; SKX_32-NEXT:    vpslld $2, %xmm1, %xmm1
1741; SKX_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1742; SKX_32-NEXT:    vpgatherdd (,%xmm0), %xmm2 {%k1}
1743; SKX_32-NEXT:    vmovdqa %xmm2, %xmm0
1744; SKX_32-NEXT:    retl
1745
1746  %sext_ind = sext <3 x i32> %ind to <3 x i64>
1747  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1748  %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
1749  ret <3 x i32>%res
1750}
1751
1752; Non-power of 2 scatter
1753declare void @llvm.masked.scatter.v3i32.v3p0i32(<3 x i32>, <3 x i32*>, i32, <3 x i1>)
; Scatter counterpart of test30: stores %src0 through the three computed
; addresses under a variable <3 x i1> mask, using the same bitwise k-register
; mask assembly as the gather case.
1754define void @test30b(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1755; KNL_64-LABEL: test30b:
1756; KNL_64:       # %bb.0:
1757; KNL_64-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
1758; KNL_64-NEXT:    movw $-3, %ax
1759; KNL_64-NEXT:    kmovw %eax, %k0
1760; KNL_64-NEXT:    andl $1, %edi
1761; KNL_64-NEXT:    kmovw %edi, %k1
1762; KNL_64-NEXT:    kandw %k0, %k1, %k0
1763; KNL_64-NEXT:    kmovw %esi, %k1
1764; KNL_64-NEXT:    kshiftlw $15, %k1, %k1
1765; KNL_64-NEXT:    kshiftrw $14, %k1, %k1
1766; KNL_64-NEXT:    korw %k1, %k0, %k0
1767; KNL_64-NEXT:    movw $-5, %ax
1768; KNL_64-NEXT:    kmovw %eax, %k1
1769; KNL_64-NEXT:    kandw %k1, %k0, %k0
1770; KNL_64-NEXT:    kmovw %edx, %k1
1771; KNL_64-NEXT:    kshiftlw $15, %k1, %k1
1772; KNL_64-NEXT:    kshiftrw $13, %k1, %k1
1773; KNL_64-NEXT:    korw %k1, %k0, %k0
1774; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
1775; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
1776; KNL_64-NEXT:    vpmovsxdq %xmm1, %ymm1
1777; KNL_64-NEXT:    vpsllq $2, %ymm1, %ymm1
1778; KNL_64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
1779; KNL_64-NEXT:    vpscatterqd %ymm2, (,%zmm0) {%k1}
1780; KNL_64-NEXT:    vzeroupper
1781; KNL_64-NEXT:    retq
1782;
1783; KNL_32-LABEL: test30b:
1784; KNL_32:       # %bb.0:
1785; KNL_32-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
1786; KNL_32-NEXT:    movw $-3, %ax
1787; KNL_32-NEXT:    kmovw %eax, %k0
1788; KNL_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1789; KNL_32-NEXT:    andl $1, %eax
1790; KNL_32-NEXT:    kmovw %eax, %k1
1791; KNL_32-NEXT:    kandw %k0, %k1, %k0
1792; KNL_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1793; KNL_32-NEXT:    kmovw %eax, %k1
1794; KNL_32-NEXT:    kshiftlw $15, %k1, %k1
1795; KNL_32-NEXT:    kshiftrw $14, %k1, %k1
1796; KNL_32-NEXT:    korw %k1, %k0, %k0
1797; KNL_32-NEXT:    movw $-5, %ax
1798; KNL_32-NEXT:    kmovw %eax, %k1
1799; KNL_32-NEXT:    kandw %k1, %k0, %k0
1800; KNL_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1801; KNL_32-NEXT:    kmovw %eax, %k1
1802; KNL_32-NEXT:    kshiftlw $15, %k1, %k1
1803; KNL_32-NEXT:    kshiftrw $13, %k1, %k1
1804; KNL_32-NEXT:    korw %k1, %k0, %k0
1805; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
1806; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
1807; KNL_32-NEXT:    vpslld $2, %xmm1, %xmm1
1808; KNL_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1809; KNL_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
1810; KNL_32-NEXT:    vzeroupper
1811; KNL_32-NEXT:    retl
1812;
1813; SKX-LABEL: test30b:
1814; SKX:       # %bb.0:
1815; SKX-NEXT:    movb $-3, %al
1816; SKX-NEXT:    kmovw %eax, %k0
1817; SKX-NEXT:    kmovw %edi, %k1
1818; SKX-NEXT:    kshiftlb $7, %k1, %k1
1819; SKX-NEXT:    kshiftrb $7, %k1, %k1
1820; SKX-NEXT:    kandw %k0, %k1, %k0
1821; SKX-NEXT:    kmovw %esi, %k1
1822; SKX-NEXT:    kshiftlb $7, %k1, %k1
1823; SKX-NEXT:    kshiftrb $6, %k1, %k1
1824; SKX-NEXT:    korw %k1, %k0, %k0
1825; SKX-NEXT:    movb $-5, %al
1826; SKX-NEXT:    kmovw %eax, %k1
1827; SKX-NEXT:    kandw %k1, %k0, %k0
1828; SKX-NEXT:    kmovw %edx, %k1
1829; SKX-NEXT:    kshiftlb $7, %k1, %k1
1830; SKX-NEXT:    kshiftrb $5, %k1, %k1
1831; SKX-NEXT:    korw %k1, %k0, %k1
1832; SKX-NEXT:    vpmovsxdq %xmm1, %ymm1
1833; SKX-NEXT:    vpsllq $2, %ymm1, %ymm1
1834; SKX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
1835; SKX-NEXT:    vpscatterqd %xmm2, (,%ymm0) {%k1}
1836; SKX-NEXT:    vzeroupper
1837; SKX-NEXT:    retq
1838;
1839; SKX_32-LABEL: test30b:
1840; SKX_32:       # %bb.0:
1841; SKX_32-NEXT:    movb $-3, %al
1842; SKX_32-NEXT:    kmovw %eax, %k0
1843; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1844; SKX_32-NEXT:    kmovw %eax, %k1
1845; SKX_32-NEXT:    kshiftlb $7, %k1, %k1
1846; SKX_32-NEXT:    kshiftrb $7, %k1, %k1
1847; SKX_32-NEXT:    kandw %k0, %k1, %k0
1848; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1849; SKX_32-NEXT:    kmovw %eax, %k1
1850; SKX_32-NEXT:    kshiftlb $7, %k1, %k1
1851; SKX_32-NEXT:    kshiftrb $6, %k1, %k1
1852; SKX_32-NEXT:    korw %k1, %k0, %k0
1853; SKX_32-NEXT:    movb $-5, %al
1854; SKX_32-NEXT:    kmovw %eax, %k1
1855; SKX_32-NEXT:    kandw %k1, %k0, %k0
1856; SKX_32-NEXT:    movb {{[0-9]+}}(%esp), %al
1857; SKX_32-NEXT:    kmovw %eax, %k1
1858; SKX_32-NEXT:    kshiftlb $7, %k1, %k1
1859; SKX_32-NEXT:    kshiftrb $5, %k1, %k1
1860; SKX_32-NEXT:    korw %k1, %k0, %k1
1861; SKX_32-NEXT:    vpslld $2, %xmm1, %xmm1
1862; SKX_32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1863; SKX_32-NEXT:    vpscatterdd %xmm2, (,%xmm0) {%k1}
1864; SKX_32-NEXT:    retl
1865  %sext_ind = sext <3 x i32> %ind to <3 x i64>
1866  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1867  call void @llvm.masked.scatter.v3i32.v3p0i32(<3 x i32> %src0, <3 x i32*> %gep.random, i32 4, <3 x i1> %mask)
1868  ret void
1869}
1870
1871declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
; Gather whose elements are themselves pointers (<16 x float*>) with an
; all-ones mask: 64-bit targets need two qword gathers (zmm0/zmm1 halves),
; while 32-bit targets fit all 16 dword-sized pointers in one dword gather.
1872define <16 x float*> @test31(<16 x float**> %ptrs) {
1873; KNL_64-LABEL: test31:
1874; KNL_64:       # %bb.0:
1875; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
1876; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
1877; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
1878; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
1879; KNL_64-NEXT:    vmovdqa64 %zmm2, %zmm0
1880; KNL_64-NEXT:    vmovdqa64 %zmm3, %zmm1
1881; KNL_64-NEXT:    retq
1882;
1883; KNL_32-LABEL: test31:
1884; KNL_32:       # %bb.0:
1885; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
1886; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
1887; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm0
1888; KNL_32-NEXT:    retl
1889;
1890; SKX-LABEL: test31:
1891; SKX:       # %bb.0:
1892; SKX-NEXT:    kxnorw %k0, %k0, %k1
1893; SKX-NEXT:    kxnorw %k0, %k0, %k2
1894; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm2 {%k2}
1895; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm3 {%k1}
1896; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
1897; SKX-NEXT:    vmovdqa64 %zmm3, %zmm1
1898; SKX-NEXT:    retq
1899;
1900; SKX_32-LABEL: test31:
1901; SKX_32:       # %bb.0:
1902; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
1903; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm1 {%k1}
1904; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm0
1905; SKX_32-NEXT:    retl
1906
1907  %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
1908  ret <16 x float*>%res
1909}
1910
; <16 x i32> gather with a variable <16 x i1> mask.  The mask vector is
; widened via vpmovsxbd + vpslld $31 and turned into a k-register with
; vptestmd (KNL) / vpmovd2m (SKX); 64-bit targets split the 16 qword
; pointers across two qword-indexed gathers using kshiftrw for the high mask.
1911define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
1912; KNL_64-LABEL: test_gather_16i32:
1913; KNL_64:       # %bb.0:
1914; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
1915; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
1916; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
1917; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
1918; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
1919; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
1920; KNL_64-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
1921; KNL_64-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1922; KNL_64-NEXT:    retq
1923;
1924; KNL_32-LABEL: test_gather_16i32:
1925; KNL_32:       # %bb.0:
1926; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1927; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
1928; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1929; KNL_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
1930; KNL_32-NEXT:    vmovdqa64 %zmm2, %zmm0
1931; KNL_32-NEXT:    retl
1932;
1933; SKX-LABEL: test_gather_16i32:
1934; SKX:       # %bb.0:
1935; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
1936; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
1937; SKX-NEXT:    vpmovd2m %zmm2, %k1
1938; SKX-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
1939; SKX-NEXT:    kshiftrw $8, %k1, %k2
1940; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
1941; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
1942; SKX-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1943; SKX-NEXT:    retq
1944;
1945; SKX_32-LABEL: test_gather_16i32:
1946; SKX_32:       # %bb.0:
1947; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1948; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
1949; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
1950; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
1951; SKX_32-NEXT:    vmovdqa64 %zmm2, %zmm0
1952; SKX_32-NEXT:    retl
1953  %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
1954  ret <16 x i32> %res
1955}
; <16 x i64> gather with a variable mask: the result spans two zmm registers,
; so both 32- and 64-bit targets issue two gathers with the mask split by
; kshiftrw $8.  The 32-bit paths realign the stack (andl $-64) to load the
; stack-passed half of %src0 with an aligned vmovdqa64.
1956define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
1957; KNL_64-LABEL: test_gather_16i64:
1958; KNL_64:       # %bb.0:
1959; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
1960; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
1961; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
1962; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
1963; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
1964; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
1965; KNL_64-NEXT:    vmovdqa64 %zmm3, %zmm0
1966; KNL_64-NEXT:    vmovdqa64 %zmm4, %zmm1
1967; KNL_64-NEXT:    retq
1968;
1969; KNL_32-LABEL: test_gather_16i64:
1970; KNL_32:       # %bb.0:
1971; KNL_32-NEXT:    pushl %ebp
1972; KNL_32-NEXT:    .cfi_def_cfa_offset 8
1973; KNL_32-NEXT:    .cfi_offset %ebp, -8
1974; KNL_32-NEXT:    movl %esp, %ebp
1975; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
1976; KNL_32-NEXT:    andl $-64, %esp
1977; KNL_32-NEXT:    subl $64, %esp
1978; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
1979; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
1980; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
1981; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
1982; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
1983; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
1984; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1985; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
1986; KNL_32-NEXT:    vmovdqa64 %zmm2, %zmm0
1987; KNL_32-NEXT:    movl %ebp, %esp
1988; KNL_32-NEXT:    popl %ebp
1989; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
1990; KNL_32-NEXT:    retl
1991;
1992; SKX-LABEL: test_gather_16i64:
1993; SKX:       # %bb.0:
1994; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
1995; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
1996; SKX-NEXT:    vpmovd2m %zmm2, %k1
1997; SKX-NEXT:    kshiftrw $8, %k1, %k2
1998; SKX-NEXT:    vpgatherqq (,%zmm0), %zmm3 {%k1}
1999; SKX-NEXT:    vpgatherqq (,%zmm1), %zmm4 {%k2}
2000; SKX-NEXT:    vmovdqa64 %zmm3, %zmm0
2001; SKX-NEXT:    vmovdqa64 %zmm4, %zmm1
2002; SKX-NEXT:    retq
2003;
2004; SKX_32-LABEL: test_gather_16i64:
2005; SKX_32:       # %bb.0:
2006; SKX_32-NEXT:    pushl %ebp
2007; SKX_32-NEXT:    .cfi_def_cfa_offset 8
2008; SKX_32-NEXT:    .cfi_offset %ebp, -8
2009; SKX_32-NEXT:    movl %esp, %ebp
2010; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
2011; SKX_32-NEXT:    andl $-64, %esp
2012; SKX_32-NEXT:    subl $64, %esp
2013; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2014; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
2015; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
2016; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
2017; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
2018; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
2019; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
2020; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
2021; SKX_32-NEXT:    vmovdqa64 %zmm2, %zmm0
2022; SKX_32-NEXT:    movl %ebp, %esp
2023; SKX_32-NEXT:    popl %ebp
2024; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
2025; SKX_32-NEXT:    retl
2026  %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
2027  ret <16 x i64> %res
2028}
2029declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
; Floating-point analogue of test_gather_16i32: <16 x float> gather with a
; variable mask; same mask materialization, with vgatherqps/vgatherdps
; replacing the integer gather forms.
2030define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
2031; KNL_64-LABEL: test_gather_16f32:
2032; KNL_64:       # %bb.0:
2033; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
2034; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
2035; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
2036; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
2037; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
2038; KNL_64-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
2039; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
2040; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2041; KNL_64-NEXT:    retq
2042;
2043; KNL_32-LABEL: test_gather_16f32:
2044; KNL_32:       # %bb.0:
2045; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2046; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
2047; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
2048; KNL_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
2049; KNL_32-NEXT:    vmovaps %zmm2, %zmm0
2050; KNL_32-NEXT:    retl
2051;
2052; SKX-LABEL: test_gather_16f32:
2053; SKX:       # %bb.0:
2054; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
2055; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
2056; SKX-NEXT:    vpmovd2m %zmm2, %k1
2057; SKX-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
2058; SKX-NEXT:    kshiftrw $8, %k1, %k2
2059; SKX-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
2060; SKX-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
2061; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2062; SKX-NEXT:    retq
2063;
2064; SKX_32-LABEL: test_gather_16f32:
2065; SKX_32:       # %bb.0:
2066; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2067; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
2068; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
2069; SKX_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
2070; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
2071; SKX_32-NEXT:    retl
2072  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
2073  ret <16 x float> %res
2074}
; Floating-point analogue of test_gather_16i64: <16 x double> gather with a
; variable mask, two gathers per target, with the 32-bit paths realigning
; the stack to load the stack-passed half of %src0.
2075define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
2076; KNL_64-LABEL: test_gather_16f64:
2077; KNL_64:       # %bb.0:
2078; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
2079; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
2080; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
2081; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
2082; KNL_64-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
2083; KNL_64-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
2084; KNL_64-NEXT:    vmovapd %zmm3, %zmm0
2085; KNL_64-NEXT:    vmovapd %zmm4, %zmm1
2086; KNL_64-NEXT:    retq
2087;
2088; KNL_32-LABEL: test_gather_16f64:
2089; KNL_32:       # %bb.0:
2090; KNL_32-NEXT:    pushl %ebp
2091; KNL_32-NEXT:    .cfi_def_cfa_offset 8
2092; KNL_32-NEXT:    .cfi_offset %ebp, -8
2093; KNL_32-NEXT:    movl %esp, %ebp
2094; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
2095; KNL_32-NEXT:    andl $-64, %esp
2096; KNL_32-NEXT:    subl $64, %esp
2097; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2098; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
2099; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
2100; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
2101; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
2102; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
2103; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
2104; KNL_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
2105; KNL_32-NEXT:    vmovapd %zmm2, %zmm0
2106; KNL_32-NEXT:    movl %ebp, %esp
2107; KNL_32-NEXT:    popl %ebp
2108; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
2109; KNL_32-NEXT:    retl
2110;
2111; SKX-LABEL: test_gather_16f64:
2112; SKX:       # %bb.0:
2113; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
2114; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
2115; SKX-NEXT:    vpmovd2m %zmm2, %k1
2116; SKX-NEXT:    kshiftrw $8, %k1, %k2
2117; SKX-NEXT:    vgatherqpd (,%zmm0), %zmm3 {%k1}
2118; SKX-NEXT:    vgatherqpd (,%zmm1), %zmm4 {%k2}
2119; SKX-NEXT:    vmovapd %zmm3, %zmm0
2120; SKX-NEXT:    vmovapd %zmm4, %zmm1
2121; SKX-NEXT:    retq
2122;
2123; SKX_32-LABEL: test_gather_16f64:
2124; SKX_32:       # %bb.0:
2125; SKX_32-NEXT:    pushl %ebp
2126; SKX_32-NEXT:    .cfi_def_cfa_offset 8
2127; SKX_32-NEXT:    .cfi_offset %ebp, -8
2128; SKX_32-NEXT:    movl %esp, %ebp
2129; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
2130; SKX_32-NEXT:    andl $-64, %esp
2131; SKX_32-NEXT:    subl $64, %esp
2132; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2133; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
2134; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
2135; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
2136; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
2137; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
2138; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
2139; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
2140; SKX_32-NEXT:    vmovapd %zmm2, %zmm0
2141; SKX_32-NEXT:    movl %ebp, %esp
2142; SKX_32-NEXT:    popl %ebp
2143; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
2144; SKX_32-NEXT:    retl
2145  %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
2146  ret <16 x double> %res
2147}
2148declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
; <16 x i32> scatter with a variable <16 x i1> mask.  Same mask
; materialization as the gathers above; 64-bit targets split the store into
; two qword-indexed scatters, 32-bit targets use one dword-indexed scatter.
2149define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
2150; KNL_64-LABEL: test_scatter_16i32:
2151; KNL_64:       # %bb.0:
2152; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
2153; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
2154; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
2155; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
2156; KNL_64-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
2157; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
2158; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
2159; KNL_64-NEXT:    vzeroupper
2160; KNL_64-NEXT:    retq
2161;
2162; KNL_32-LABEL: test_scatter_16i32:
2163; KNL_32:       # %bb.0:
2164; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2165; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
2166; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
2167; KNL_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
2168; KNL_32-NEXT:    vzeroupper
2169; KNL_32-NEXT:    retl
2170;
2171; SKX-LABEL: test_scatter_16i32:
2172; SKX:       # %bb.0:
2173; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
2174; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
2175; SKX-NEXT:    vpmovd2m %zmm2, %k1
2176; SKX-NEXT:    kshiftrw $8, %k1, %k2
2177; SKX-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
2178; SKX-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
2179; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
2180; SKX-NEXT:    vzeroupper
2181; SKX-NEXT:    retq
2182;
2183; SKX_32-LABEL: test_scatter_16i32:
2184; SKX_32:       # %bb.0:
2185; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2186; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
2187; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
2188; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
2189; SKX_32-NEXT:    vzeroupper
2190; SKX_32-NEXT:    retl
2191  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
2192  ret void
2193}
; <16 x i64> scatter with a variable mask: the data spans two zmm registers,
; so every target issues two scatters with the mask split by kshiftrw $8.
; The 32-bit paths realign the stack to load the stack-passed half of %src0.
2194define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
2195; KNL_64-LABEL: test_scatter_16i64:
2196; KNL_64:       # %bb.0:
2197; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
2198; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
2199; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
2200; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
2201; KNL_64-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
2202; KNL_64-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
2203; KNL_64-NEXT:    vzeroupper
2204; KNL_64-NEXT:    retq
2205;
2206; KNL_32-LABEL: test_scatter_16i64:
2207; KNL_32:       # %bb.0:
2208; KNL_32-NEXT:    pushl %ebp
2209; KNL_32-NEXT:    .cfi_def_cfa_offset 8
2210; KNL_32-NEXT:    .cfi_offset %ebp, -8
2211; KNL_32-NEXT:    movl %esp, %ebp
2212; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
2213; KNL_32-NEXT:    andl $-64, %esp
2214; KNL_32-NEXT:    subl $64, %esp
2215; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2216; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
2217; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
2218; KNL_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
2219; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
2220; KNL_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
2221; KNL_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
2222; KNL_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
2223; KNL_32-NEXT:    movl %ebp, %esp
2224; KNL_32-NEXT:    popl %ebp
2225; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
2226; KNL_32-NEXT:    vzeroupper
2227; KNL_32-NEXT:    retl
2228;
2229; SKX-LABEL: test_scatter_16i64:
2230; SKX:       # %bb.0:
2231; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
2232; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
2233; SKX-NEXT:    vpmovd2m %zmm2, %k1
2234; SKX-NEXT:    kshiftrw $8, %k1, %k2
2235; SKX-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
2236; SKX-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
2237; SKX-NEXT:    vzeroupper
2238; SKX-NEXT:    retq
2239;
2240; SKX_32-LABEL: test_scatter_16i64:
2241; SKX_32:       # %bb.0:
2242; SKX_32-NEXT:    pushl %ebp
2243; SKX_32-NEXT:    .cfi_def_cfa_offset 8
2244; SKX_32-NEXT:    .cfi_offset %ebp, -8
2245; SKX_32-NEXT:    movl %esp, %ebp
2246; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
2247; SKX_32-NEXT:    andl $-64, %esp
2248; SKX_32-NEXT:    subl $64, %esp
2249; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2250; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
2251; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
2252; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
2253; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
2254; SKX_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
2255; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
2256; SKX_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
2257; SKX_32-NEXT:    movl %ebp, %esp
2258; SKX_32-NEXT:    popl %ebp
2259; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
2260; SKX_32-NEXT:    vzeroupper
2261; SKX_32-NEXT:    retl
2262  call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
2263  ret void
2264}
2265declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
; 16 x float masked scatter: 64-bit KNL/SKX split the i1 mask with kshiftrw and
; issue two 8-lane vscatterqps; 32-bit targets can use one dword-indexed vscatterdps.
2266define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
2267; KNL_64-LABEL: test_scatter_16f32:
2268; KNL_64:       # %bb.0:
2269; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
2270; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
2271; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
2272; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
2273; KNL_64-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
2274; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm0
2275; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
2276; KNL_64-NEXT:    vzeroupper
2277; KNL_64-NEXT:    retq
2278;
2279; KNL_32-LABEL: test_scatter_16f32:
2280; KNL_32:       # %bb.0:
2281; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2282; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
2283; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
2284; KNL_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
2285; KNL_32-NEXT:    vzeroupper
2286; KNL_32-NEXT:    retl
2287;
2288; SKX-LABEL: test_scatter_16f32:
2289; SKX:       # %bb.0:
2290; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
2291; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
2292; SKX-NEXT:    vpmovd2m %zmm2, %k1
2293; SKX-NEXT:    kshiftrw $8, %k1, %k2
2294; SKX-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
2295; SKX-NEXT:    vextractf64x4 $1, %zmm3, %ymm0
2296; SKX-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
2297; SKX-NEXT:    vzeroupper
2298; SKX-NEXT:    retq
2299;
2300; SKX_32-LABEL: test_scatter_16f32:
2301; SKX_32:       # %bb.0:
2302; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2303; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
2304; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
2305; SKX_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
2306; SKX_32-NEXT:    vzeroupper
2307; SKX_32-NEXT:    retl
2308  call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
2309  ret void
2310}
2311declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
; 16 x double masked scatter: split into two 8-lane scatters on every target; the
; 32-bit variants realign the stack (andl $-64) and load the high half of %src0
; from the incoming argument area at 8(%ebp).
2312define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
2313; KNL_64-LABEL: test_scatter_16f64:
2314; KNL_64:       # %bb.0:
2315; KNL_64-NEXT:    vpmovsxbd %xmm2, %zmm2
2316; KNL_64-NEXT:    vpslld $31, %zmm2, %zmm2
2317; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
2318; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
2319; KNL_64-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
2320; KNL_64-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
2321; KNL_64-NEXT:    vzeroupper
2322; KNL_64-NEXT:    retq
2323;
2324; KNL_32-LABEL: test_scatter_16f64:
2325; KNL_32:       # %bb.0:
2326; KNL_32-NEXT:    pushl %ebp
2327; KNL_32-NEXT:    .cfi_def_cfa_offset 8
2328; KNL_32-NEXT:    .cfi_offset %ebp, -8
2329; KNL_32-NEXT:    movl %esp, %ebp
2330; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
2331; KNL_32-NEXT:    andl $-64, %esp
2332; KNL_32-NEXT:    subl $64, %esp
2333; KNL_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2334; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
2335; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
2336; KNL_32-NEXT:    vmovapd 8(%ebp), %zmm1
2337; KNL_32-NEXT:    kshiftrw $8, %k1, %k2
2338; KNL_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
2339; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
2340; KNL_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
2341; KNL_32-NEXT:    movl %ebp, %esp
2342; KNL_32-NEXT:    popl %ebp
2343; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
2344; KNL_32-NEXT:    vzeroupper
2345; KNL_32-NEXT:    retl
2346;
2347; SKX-LABEL: test_scatter_16f64:
2348; SKX:       # %bb.0:
2349; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
2350; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
2351; SKX-NEXT:    vpmovd2m %zmm2, %k1
2352; SKX-NEXT:    kshiftrw $8, %k1, %k2
2353; SKX-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
2354; SKX-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
2355; SKX-NEXT:    vzeroupper
2356; SKX-NEXT:    retq
2357;
2358; SKX_32-LABEL: test_scatter_16f64:
2359; SKX_32:       # %bb.0:
2360; SKX_32-NEXT:    pushl %ebp
2361; SKX_32-NEXT:    .cfi_def_cfa_offset 8
2362; SKX_32-NEXT:    .cfi_offset %ebp, -8
2363; SKX_32-NEXT:    movl %esp, %ebp
2364; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
2365; SKX_32-NEXT:    andl $-64, %esp
2366; SKX_32-NEXT:    subl $64, %esp
2367; SKX_32-NEXT:    vpmovsxbd %xmm1, %zmm1
2368; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
2369; SKX_32-NEXT:    vpmovd2m %zmm1, %k1
2370; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
2371; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
2372; SKX_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
2373; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
2374; SKX_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
2375; SKX_32-NEXT:    movl %ebp, %esp
2376; SKX_32-NEXT:    popl %ebp
2377; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
2378; SKX_32-NEXT:    vzeroupper
2379; SKX_32-NEXT:    retl
2380  call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
2381  ret void
2382}
2383declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
2384
; PR28312: the three identical masked gathers below are CSE'd to a single
; vpgatherqq, and g1+g2+g3 is computed from that one result with two vpaddq.
2385define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
2386; KNL_64-LABEL: test_pr28312:
2387; KNL_64:       # %bb.0:
2388; KNL_64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
2389; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
2390; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k0
2391; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
2392; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
2393; KNL_64-NEXT:    vpgatherqq (,%zmm0), %zmm1 {%k1}
2394; KNL_64-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
2395; KNL_64-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
2396; KNL_64-NEXT:    retq
2397;
2398; KNL_32-LABEL: test_pr28312:
2399; KNL_32:       # %bb.0:
2400; KNL_32-NEXT:    pushl %ebp
2401; KNL_32-NEXT:    .cfi_def_cfa_offset 8
2402; KNL_32-NEXT:    .cfi_offset %ebp, -8
2403; KNL_32-NEXT:    movl %esp, %ebp
2404; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
2405; KNL_32-NEXT:    andl $-32, %esp
2406; KNL_32-NEXT:    subl $32, %esp
2407; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2408; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
2409; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k0
2410; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
2411; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
2412; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k1}
2413; KNL_32-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
2414; KNL_32-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
2415; KNL_32-NEXT:    movl %ebp, %esp
2416; KNL_32-NEXT:    popl %ebp
2417; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
2418; KNL_32-NEXT:    retl
2419;
2420; SKX-LABEL: test_pr28312:
2421; SKX:       # %bb.0:
2422; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
2423; SKX-NEXT:    vpmovd2m %xmm1, %k1
2424; SKX-NEXT:    vpgatherqq (,%ymm0), %ymm1 {%k1}
2425; SKX-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
2426; SKX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
2427; SKX-NEXT:    retq
2428;
2429; SKX_32-LABEL: test_pr28312:
2430; SKX_32:       # %bb.0:
2431; SKX_32-NEXT:    pushl %ebp
2432; SKX_32-NEXT:    .cfi_def_cfa_offset 8
2433; SKX_32-NEXT:    .cfi_offset %ebp, -8
2434; SKX_32-NEXT:    movl %esp, %ebp
2435; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
2436; SKX_32-NEXT:    andl $-32, %esp
2437; SKX_32-NEXT:    subl $32, %esp
2438; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
2439; SKX_32-NEXT:    vpmovd2m %xmm1, %k1
2440; SKX_32-NEXT:    vpgatherdq (,%xmm0), %ymm1 {%k1}
2441; SKX_32-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
2442; SKX_32-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
2443; SKX_32-NEXT:    movl %ebp, %esp
2444; SKX_32-NEXT:    popl %ebp
2445; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
2446; SKX_32-NEXT:    retl
2447  %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2448  %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2449  %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2450  %a = add <4 x i64> %g1, %g2
2451  %b = add <4 x i64> %a, %g3
2452  ret <4 x i64> %b
2453}
2454declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
2455
; Gather from the global constant @glob_array: small code models fold the symbol
; into the vpgatherqd addressing mode; the large code model must first
; materialize the address with movabsq.
2456define <8 x i32> @test_global_array(<8 x i64> %indxs) {
2457; KNL_64-LABEL: test_global_array:
2458; KNL_64:       # %bb.0:
2459; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
2460; KNL_64-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2461; KNL_64-NEXT:    vmovdqa %ymm1, %ymm0
2462; KNL_64-NEXT:    retq
2463;
2464; KNL_32-LABEL: test_global_array:
2465; KNL_32:       # %bb.0:
2466; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
2467; KNL_32-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2468; KNL_32-NEXT:    vmovdqa %ymm1, %ymm0
2469; KNL_32-NEXT:    retl
2470;
2471; SKX_SMALL-LABEL: test_global_array:
2472; SKX_SMALL:       # %bb.0:
2473; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
2474; SKX_SMALL-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2475; SKX_SMALL-NEXT:    vmovdqa %ymm1, %ymm0
2476; SKX_SMALL-NEXT:    retq
2477;
2478; SKX_LARGE-LABEL: test_global_array:
2479; SKX_LARGE:       # %bb.0:
2480; SKX_LARGE-NEXT:    movabsq $glob_array, %rax
2481; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
2482; SKX_LARGE-NEXT:    vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
2483; SKX_LARGE-NEXT:    vmovdqa %ymm1, %ymm0
2484; SKX_LARGE-NEXT:    retq
2485;
2486; SKX_32-LABEL: test_global_array:
2487; SKX_32:       # %bb.0:
2488; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2489; SKX_32-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2490; SKX_32-NEXT:    vmovdqa %ymm1, %ymm0
2491; SKX_32-NEXT:    retl
2492  %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <8 x i64> %indxs
2493  %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
2494  ret <8 x i32> %g
2495}
2496
; Same as test_global_array but the first GEP index is a zeroinitializer vector;
; the generated code is expected to be identical to the scalar-zero-index case.
2497define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
2498; KNL_64-LABEL: test_global_array_zeroinitializer_index:
2499; KNL_64:       # %bb.0:
2500; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
2501; KNL_64-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2502; KNL_64-NEXT:    vmovdqa %ymm1, %ymm0
2503; KNL_64-NEXT:    retq
2504;
2505; KNL_32-LABEL: test_global_array_zeroinitializer_index:
2506; KNL_32:       # %bb.0:
2507; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
2508; KNL_32-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2509; KNL_32-NEXT:    vmovdqa %ymm1, %ymm0
2510; KNL_32-NEXT:    retl
2511;
2512; SKX_SMALL-LABEL: test_global_array_zeroinitializer_index:
2513; SKX_SMALL:       # %bb.0:
2514; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
2515; SKX_SMALL-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2516; SKX_SMALL-NEXT:    vmovdqa %ymm1, %ymm0
2517; SKX_SMALL-NEXT:    retq
2518;
2519; SKX_LARGE-LABEL: test_global_array_zeroinitializer_index:
2520; SKX_LARGE:       # %bb.0:
2521; SKX_LARGE-NEXT:    movabsq $glob_array, %rax
2522; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
2523; SKX_LARGE-NEXT:    vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
2524; SKX_LARGE-NEXT:    vmovdqa %ymm1, %ymm0
2525; SKX_LARGE-NEXT:    retq
2526;
2527; SKX_32-LABEL: test_global_array_zeroinitializer_index:
2528; SKX_32:       # %bb.0:
2529; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2530; SKX_32-NEXT:    vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2531; SKX_32-NEXT:    vmovdqa %ymm1, %ymm0
2532; SKX_32-NEXT:    retl
2533  %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs
2534  %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
2535  ret <8 x i32> %g
2536}
2537
; Single-element scatter is scalarized: test the mask bit, branch, and do a
; plain movl store in the cond.store block.
2538define void @v1_scatter(<1 x i32>%a1, <1 x i32*> %ptr, <1 x i1> %mask) {
2539; KNL_64-LABEL: v1_scatter:
2540; KNL_64:       # %bb.0:
2541; KNL_64-NEXT:    testb $1, %dl
2542; KNL_64-NEXT:    je .LBB45_2
2543; KNL_64-NEXT:  # %bb.1: # %cond.store
2544; KNL_64-NEXT:    movl %edi, (%rsi)
2545; KNL_64-NEXT:  .LBB45_2: # %else
2546; KNL_64-NEXT:    retq
2547;
2548; KNL_32-LABEL: v1_scatter:
2549; KNL_32:       # %bb.0:
2550; KNL_32-NEXT:    testb $1, {{[0-9]+}}(%esp)
2551; KNL_32-NEXT:    je .LBB45_2
2552; KNL_32-NEXT:  # %bb.1: # %cond.store
2553; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2554; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2555; KNL_32-NEXT:    movl %ecx, (%eax)
2556; KNL_32-NEXT:  .LBB45_2: # %else
2557; KNL_32-NEXT:    retl
2558;
2559; SKX-LABEL: v1_scatter:
2560; SKX:       # %bb.0:
2561; SKX-NEXT:    testb $1, %dl
2562; SKX-NEXT:    je .LBB45_2
2563; SKX-NEXT:  # %bb.1: # %cond.store
2564; SKX-NEXT:    movl %edi, (%rsi)
2565; SKX-NEXT:  .LBB45_2: # %else
2566; SKX-NEXT:    retq
2567;
2568; SKX_32-LABEL: v1_scatter:
2569; SKX_32:       # %bb.0:
2570; SKX_32-NEXT:    testb $1, {{[0-9]+}}(%esp)
2571; SKX_32-NEXT:    je .LBB45_2
2572; SKX_32-NEXT:  # %bb.1: # %cond.store
2573; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2574; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2575; SKX_32-NEXT:    movl %ecx, (%eax)
2576; SKX_32-NEXT:  .LBB45_2: # %else
2577; SKX_32-NEXT:    retl
2578  call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %a1, <1 x i32*> %ptr, i32 4, <1 x i1> %mask)
2579  ret void
2580}
2581declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>)
2582
; Single-element gather with a constant all-true mask folds to an unconditional
; scalar load. Note the call below passes <i1 true>, not the %mask parameter.
2583define <1 x i32> @v1_gather(<1 x i32*> %ptr, <1 x i1> %mask, <1 x i32> %src0) {
2584; KNL_64-LABEL: v1_gather:
2585; KNL_64:       # %bb.0:
2586; KNL_64-NEXT:    movl (%rdi), %eax
2587; KNL_64-NEXT:    retq
2588;
2589; KNL_32-LABEL: v1_gather:
2590; KNL_32:       # %bb.0:
2591; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2592; KNL_32-NEXT:    movl (%eax), %eax
2593; KNL_32-NEXT:    retl
2594;
2595; SKX-LABEL: v1_gather:
2596; SKX:       # %bb.0:
2597; SKX-NEXT:    movl (%rdi), %eax
2598; SKX-NEXT:    retq
2599;
2600; SKX_32-LABEL: v1_gather:
2601; SKX_32:       # %bb.0:
2602; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2603; SKX_32-NEXT:    movl (%eax), %eax
2604; SKX_32-NEXT:    retl
2605  %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0)
2606  ret <1 x i32>%res
2607}
2608declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>)
2609
2610; Make sure we don't crash when the index element type is larger than i64 and we need to widen the result
2611; This experienced a bad interaction when we widened and then tried to split.
; Wider-than-i64 (<2 x i128>) index: only the low 64 bits of each index are used
; (vmovq of %rsi/%rcx on 64-bit) to rebuild the index vector before the gather.
; Regression test for a bad widen-then-split interaction (see comment above).
2612define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) {
2613; KNL_64-LABEL: large_index:
2614; KNL_64:       # %bb.0:
2615; KNL_64-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2616; KNL_64-NEXT:    vpsllq $63, %xmm0, %xmm0
2617; KNL_64-NEXT:    vptestmq %zmm0, %zmm0, %k0
2618; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
2619; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
2620; KNL_64-NEXT:    vmovq %rcx, %xmm0
2621; KNL_64-NEXT:    vmovq %rsi, %xmm2
2622; KNL_64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2623; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k1}
2624; KNL_64-NEXT:    vmovaps %xmm1, %xmm0
2625; KNL_64-NEXT:    vzeroupper
2626; KNL_64-NEXT:    retq
2627;
2628; KNL_32-LABEL: large_index:
2629; KNL_32:       # %bb.0:
2630; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2631; KNL_32-NEXT:    vpsllq $63, %xmm0, %xmm0
2632; KNL_32-NEXT:    vptestmq %zmm0, %zmm0, %k0
2633; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
2634; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
2635; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2636; KNL_32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2637; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2638; KNL_32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2639; KNL_32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2640; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm1 {%k1}
2641; KNL_32-NEXT:    vmovaps %xmm1, %xmm0
2642; KNL_32-NEXT:    vzeroupper
2643; KNL_32-NEXT:    retl
2644;
2645; SKX-LABEL: large_index:
2646; SKX:       # %bb.0:
2647; SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
2648; SKX-NEXT:    vpmovq2m %xmm0, %k1
2649; SKX-NEXT:    vmovq %rcx, %xmm0
2650; SKX-NEXT:    vmovq %rsi, %xmm2
2651; SKX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2652; SKX-NEXT:    vgatherqps (%rdi,%xmm0,4), %xmm1 {%k1}
2653; SKX-NEXT:    vmovaps %xmm1, %xmm0
2654; SKX-NEXT:    retq
2655;
2656; SKX_32-LABEL: large_index:
2657; SKX_32:       # %bb.0:
2658; SKX_32-NEXT:    vpsllq $63, %xmm0, %xmm0
2659; SKX_32-NEXT:    vpmovq2m %xmm0, %k1
2660; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2661; SKX_32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2662; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2663; SKX_32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2664; SKX_32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2665; SKX_32-NEXT:    vgatherqps (%eax,%xmm0,4), %xmm1 {%k1}
2666; SKX_32-NEXT:    vmovaps %xmm1, %xmm0
2667; SKX_32-NEXT:    retl
2668  %gep.random = getelementptr float, float* %base, <2 x i128> %ind
2669  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
2670  ret <2 x float>%res
2671}
2672
2673; Make sure we allow index to be sign extended from a smaller than i32 element size.
; <16 x i8> index sign-extended to i64: codegen only needs vpmovsxbd to dword
; indices and a dword-indexed gather (no 64-bit index vector is materialized).
2674define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
2675; KNL_64-LABEL: sext_i8_index:
2676; KNL_64:       # %bb.0:
2677; KNL_64-NEXT:    vpmovsxbd %xmm0, %zmm1
2678; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
2679; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2680; KNL_64-NEXT:    retq
2681;
2682; KNL_32-LABEL: sext_i8_index:
2683; KNL_32:       # %bb.0:
2684; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2685; KNL_32-NEXT:    vpmovsxbd %xmm0, %zmm1
2686; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
2687; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2688; KNL_32-NEXT:    retl
2689;
2690; SKX-LABEL: sext_i8_index:
2691; SKX:       # %bb.0:
2692; SKX-NEXT:    vpmovsxbd %xmm0, %zmm1
2693; SKX-NEXT:    kxnorw %k0, %k0, %k1
2694; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2695; SKX-NEXT:    retq
2696;
2697; SKX_32-LABEL: sext_i8_index:
2698; SKX_32:       # %bb.0:
2699; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2700; SKX_32-NEXT:    vpmovsxbd %xmm0, %zmm1
2701; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2702; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2703; SKX_32-NEXT:    retl
2704
2705  %sext_ind = sext <16 x i8> %ind to <16 x i64>
2706  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
2707
2708  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2709  ret <16 x float>%res
2710}
2711
2712; Make sure we allow index to be sign extended from a smaller than i32 element size.
; <8 x i8> index sign-extended: KNL lacks 256-bit gathers, so it widens to a
; zmm gather under a $255 (low 8 lanes) mask; SKX gathers with ymm directly.
2713define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
2714; KNL_64-LABEL: sext_v8i8_index:
2715; KNL_64:       # %bb.0:
2716; KNL_64-NEXT:    vpmovsxbd %xmm0, %ymm1
2717; KNL_64-NEXT:    movw $255, %ax
2718; KNL_64-NEXT:    kmovw %eax, %k1
2719; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2720; KNL_64-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2721; KNL_64-NEXT:    retq
2722;
2723; KNL_32-LABEL: sext_v8i8_index:
2724; KNL_32:       # %bb.0:
2725; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2726; KNL_32-NEXT:    vpmovsxbd %xmm0, %ymm1
2727; KNL_32-NEXT:    movw $255, %cx
2728; KNL_32-NEXT:    kmovw %ecx, %k1
2729; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2730; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2731; KNL_32-NEXT:    retl
2732;
2733; SKX-LABEL: sext_v8i8_index:
2734; SKX:       # %bb.0:
2735; SKX-NEXT:    vpmovsxbd %xmm0, %ymm1
2736; SKX-NEXT:    kxnorw %k0, %k0, %k1
2737; SKX-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
2738; SKX-NEXT:    retq
2739;
2740; SKX_32-LABEL: sext_v8i8_index:
2741; SKX_32:       # %bb.0:
2742; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2743; SKX_32-NEXT:    vpmovsxbd %xmm0, %ymm1
2744; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2745; SKX_32-NEXT:    vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
2746; SKX_32-NEXT:    retl
2747
2748  %sext_ind = sext <8 x i8> %ind to <8 x i64>
2749  %gep.random = getelementptr float, float *%base, <8 x i64> %sext_ind
2750
2751  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
2752  ret <8 x float>%res
2753}
2754declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
2755
2756; Make sure we also allow index to be zero extended from a smaller than i32 element size.
; <16 x i8> index zero-extended to i64: lowered as vpmovzxbd to dword indices
; plus a dword-indexed gather, mirroring the sign-extend case above.
2757define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
2758; KNL_64-LABEL: zext_i8_index:
2759; KNL_64:       # %bb.0:
2760; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2761; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
2762; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2763; KNL_64-NEXT:    retq
2764;
2765; KNL_32-LABEL: zext_i8_index:
2766; KNL_32:       # %bb.0:
2767; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2768; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2769; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
2770; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2771; KNL_32-NEXT:    retl
2772;
2773; SKX-LABEL: zext_i8_index:
2774; SKX:       # %bb.0:
2775; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2776; SKX-NEXT:    kxnorw %k0, %k0, %k1
2777; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2778; SKX-NEXT:    retq
2779;
2780; SKX_32-LABEL: zext_i8_index:
2781; SKX_32:       # %bb.0:
2782; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2783; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2784; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2785; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2786; SKX_32-NEXT:    retl
2787
2788  %zext_ind = zext <16 x i8> %ind to <16 x i64>
2789  %gep.random = getelementptr float, float *%base, <16 x i64> %zext_ind
2790
2791  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2792  ret <16 x float>%res
2793}
2794
2795; Make sure we also allow index to be zero extended from a smaller than i32 element size.
; <8 x i8> index zero-extended: KNL widens to a zmm gather under a $255 mask;
; SKX uses a native 256-bit vgatherdps with an all-true k-mask.
2796define <8 x float> @zext_v8i8_index(float* %base, <8 x i8> %ind) {
2797; KNL_64-LABEL: zext_v8i8_index:
2798; KNL_64:       # %bb.0:
2799; KNL_64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2800; KNL_64-NEXT:    movw $255, %ax
2801; KNL_64-NEXT:    kmovw %eax, %k1
2802; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2803; KNL_64-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2804; KNL_64-NEXT:    retq
2805;
2806; KNL_32-LABEL: zext_v8i8_index:
2807; KNL_32:       # %bb.0:
2808; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2809; KNL_32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2810; KNL_32-NEXT:    movw $255, %cx
2811; KNL_32-NEXT:    kmovw %ecx, %k1
2812; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2813; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2814; KNL_32-NEXT:    retl
2815;
2816; SKX-LABEL: zext_v8i8_index:
2817; SKX:       # %bb.0:
2818; SKX-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2819; SKX-NEXT:    kxnorw %k0, %k0, %k1
2820; SKX-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
2821; SKX-NEXT:    retq
2822;
2823; SKX_32-LABEL: zext_v8i8_index:
2824; SKX_32:       # %bb.0:
2825; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2826; SKX_32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
2827; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2828; SKX_32-NEXT:    vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
2829; SKX_32-NEXT:    retl
2830
2831  %zext_ind = zext <8 x i8> %ind to <8 x i64>
2832  %gep.random = getelementptr float, float *%base, <8 x i64> %zext_ind
2833
2834  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
2835  ret <8 x float>%res
2836}
2837
2838; Index requires promotion
; <2 x i32> index requires promotion (see comment above); lowered to a
; dword-indexed vscatterdpd with scale 8.
2839define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) {
2840; KNL_64-LABEL: test_scatter_2i32_index:
2841; KNL_64:       # %bb.0:
2842; KNL_64-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2843; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2844; KNL_64-NEXT:    vpsllq $63, %xmm2, %xmm2
2845; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k0
2846; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
2847; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
2848; KNL_64-NEXT:    vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1}
2849; KNL_64-NEXT:    vzeroupper
2850; KNL_64-NEXT:    retq
2851;
2852; KNL_32-LABEL: test_scatter_2i32_index:
2853; KNL_32:       # %bb.0:
2854; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2855; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2856; KNL_32-NEXT:    vpsllq $63, %xmm2, %xmm2
2857; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k0
2858; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
2859; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
2860; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2861; KNL_32-NEXT:    vscatterdpd %zmm0, (%eax,%ymm1,8) {%k1}
2862; KNL_32-NEXT:    vzeroupper
2863; KNL_32-NEXT:    retl
2864;
2865; SKX-LABEL: test_scatter_2i32_index:
2866; SKX:       # %bb.0:
2867; SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
2868; SKX-NEXT:    vpmovq2m %xmm2, %k1
2869; SKX-NEXT:    vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1}
2870; SKX-NEXT:    retq
2871;
2872; SKX_32-LABEL: test_scatter_2i32_index:
2873; SKX_32:       # %bb.0:
2874; SKX_32-NEXT:    vpsllq $63, %xmm2, %xmm2
2875; SKX_32-NEXT:    vpmovq2m %xmm2, %k1
2876; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2877; SKX_32-NEXT:    vscatterdpd %xmm0, (%eax,%xmm1,8) {%k1}
2878; SKX_32-NEXT:    retl
2879  %gep = getelementptr double, double *%base, <2 x i32> %ind
2880  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
2881  ret void
2882}
2883declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
2884
; Index masked with `and 15` then zero-extended: the and survives as
; vpandd/vandps against a broadcast constant and the gather stays dword-indexed;
; the large code model loads the constant-pool address via movabsq first.
2885define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
2886; KNL_64-LABEL: zext_index:
2887; KNL_64:       # %bb.0:
2888; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
2889; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
2890; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2891; KNL_64-NEXT:    retq
2892;
2893; KNL_32-LABEL: zext_index:
2894; KNL_32:       # %bb.0:
2895; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2896; KNL_32-NEXT:    vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2897; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
2898; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2899; KNL_32-NEXT:    retl
2900;
2901; SKX_SMALL-LABEL: zext_index:
2902; SKX_SMALL:       # %bb.0:
2903; SKX_SMALL-NEXT:    vandps {{.*}}(%rip){1to16}, %zmm0, %zmm1
2904; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
2905; SKX_SMALL-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2906; SKX_SMALL-NEXT:    retq
2907;
2908; SKX_LARGE-LABEL: zext_index:
2909; SKX_LARGE:       # %bb.0:
2910; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
2911; SKX_LARGE-NEXT:    vandps (%rax){1to16}, %zmm0, %zmm1
2912; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
2913; SKX_LARGE-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2914; SKX_LARGE-NEXT:    retq
2915;
2916; SKX_32-LABEL: zext_index:
2917; SKX_32:       # %bb.0:
2918; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2919; SKX_32-NEXT:    vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2920; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2921; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2922; SKX_32-NEXT:    retl
2923  %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
2924  %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
2925  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
2926
2927  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2928  ret <16 x float>%res
2929}
2930
; A <16 x double> gather is wider than a single zmm result (8 doubles), so it
; must be split into two vgatherdpd operations; the icmp-eq-zero mask is
; likewise split across the %k1 and %k2 mask registers.
2931define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
2932; KNL_64-LABEL: test_gather_setcc_split:
2933; KNL_64:       # %bb.0:
2934; KNL_64-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
2935; KNL_64-NEXT:    vptestnmd %zmm4, %zmm4, %k1
2936; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k2
2937; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2938; KNL_64-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
2939; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
2940; KNL_64-NEXT:    vmovapd %zmm2, %zmm0
2941; KNL_64-NEXT:    vmovapd %zmm3, %zmm1
2942; KNL_64-NEXT:    retq
2943;
2944; KNL_32-LABEL: test_gather_setcc_split:
2945; KNL_32:       # %bb.0:
2946; KNL_32-NEXT:    pushl %ebp
2947; KNL_32-NEXT:    .cfi_def_cfa_offset 8
2948; KNL_32-NEXT:    .cfi_offset %ebp, -8
2949; KNL_32-NEXT:    movl %esp, %ebp
2950; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
2951; KNL_32-NEXT:    andl $-64, %esp
2952; KNL_32-NEXT:    subl $64, %esp
2953; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm3
2954; KNL_32-NEXT:    movl 8(%ebp), %eax
2955; KNL_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
2956; KNL_32-NEXT:    vptestnmd %zmm4, %zmm4, %k1
2957; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k2
2958; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2959; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
2960; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
2961; KNL_32-NEXT:    vmovapd %zmm2, %zmm0
2962; KNL_32-NEXT:    vmovapd %zmm3, %zmm1
2963; KNL_32-NEXT:    movl %ebp, %esp
2964; KNL_32-NEXT:    popl %ebp
2965; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
2966; KNL_32-NEXT:    retl
2967;
2968; SKX-LABEL: test_gather_setcc_split:
2969; SKX:       # %bb.0:
2970; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
2971; SKX-NEXT:    vptestnmd %ymm4, %ymm4, %k1
2972; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
2973; SKX-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2974; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
2975; SKX-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
2976; SKX-NEXT:    vmovapd %zmm2, %zmm0
2977; SKX-NEXT:    vmovapd %zmm3, %zmm1
2978; SKX-NEXT:    retq
2979;
2980; SKX_32-LABEL: test_gather_setcc_split:
2981; SKX_32:       # %bb.0:
2982; SKX_32-NEXT:    pushl %ebp
2983; SKX_32-NEXT:    .cfi_def_cfa_offset 8
2984; SKX_32-NEXT:    .cfi_offset %ebp, -8
2985; SKX_32-NEXT:    movl %esp, %ebp
2986; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
2987; SKX_32-NEXT:    andl $-64, %esp
2988; SKX_32-NEXT:    subl $64, %esp
2989; SKX_32-NEXT:    vmovapd 72(%ebp), %zmm3
2990; SKX_32-NEXT:    movl 8(%ebp), %eax
2991; SKX_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
2992; SKX_32-NEXT:    vptestnmd %ymm4, %ymm4, %k1
2993; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k2
2994; SKX_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2995; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
2996; SKX_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
2997; SKX_32-NEXT:    vmovapd %zmm2, %zmm0
2998; SKX_32-NEXT:    vmovapd %zmm3, %zmm1
2999; SKX_32-NEXT:    movl %ebp, %esp
3000; SKX_32-NEXT:    popl %ebp
3001; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
3002; SKX_32-NEXT:    retl
3003  %sext_ind = sext <16 x i32> %ind to <16 x i64>
3004  %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
3005
3006  %mask = icmp eq <16 x i32> %cmp, zeroinitializer
3007  %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %gep.random, i32 4, <16 x i1> %mask, <16 x double> %passthru)
3008  ret <16 x double>%res
3009}
3010
; A <16 x double> scatter is wider than a single zmm source (8 doubles), so it
; is split into two vscatterdpd operations; the icmp-eq-zero mask is likewise
; split across the %k1 and %k2 mask registers.
3011define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0)  {
3012; KNL_64-LABEL: test_scatter_setcc_split:
3013; KNL_64:       # %bb.0:
3014; KNL_64-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
3015; KNL_64-NEXT:    vptestnmd %zmm4, %zmm4, %k1
3016; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k2
3017; KNL_64-NEXT:    vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
3018; KNL_64-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
3019; KNL_64-NEXT:    vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
3020; KNL_64-NEXT:    vzeroupper
3021; KNL_64-NEXT:    retq
3022;
3023; KNL_32-LABEL: test_scatter_setcc_split:
3024; KNL_32:       # %bb.0:
3025; KNL_32-NEXT:    pushl %ebp
3026; KNL_32-NEXT:    .cfi_def_cfa_offset 8
3027; KNL_32-NEXT:    .cfi_offset %ebp, -8
3028; KNL_32-NEXT:    movl %esp, %ebp
3029; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
3030; KNL_32-NEXT:    andl $-64, %esp
3031; KNL_32-NEXT:    subl $64, %esp
3032; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm3
3033; KNL_32-NEXT:    movl 8(%ebp), %eax
3034; KNL_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
3035; KNL_32-NEXT:    vptestnmd %zmm4, %zmm4, %k1
3036; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k2
3037; KNL_32-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
3038; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
3039; KNL_32-NEXT:    vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
3040; KNL_32-NEXT:    movl %ebp, %esp
3041; KNL_32-NEXT:    popl %ebp
3042; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
3043; KNL_32-NEXT:    vzeroupper
3044; KNL_32-NEXT:    retl
3045;
3046; SKX-LABEL: test_scatter_setcc_split:
3047; SKX:       # %bb.0:
3048; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
3049; SKX-NEXT:    vptestnmd %ymm4, %ymm4, %k1
3050; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
3051; SKX-NEXT:    vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
3052; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
3053; SKX-NEXT:    vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
3054; SKX-NEXT:    vzeroupper
3055; SKX-NEXT:    retq
3056;
3057; SKX_32-LABEL: test_scatter_setcc_split:
3058; SKX_32:       # %bb.0:
3059; SKX_32-NEXT:    pushl %ebp
3060; SKX_32-NEXT:    .cfi_def_cfa_offset 8
3061; SKX_32-NEXT:    .cfi_offset %ebp, -8
3062; SKX_32-NEXT:    movl %esp, %ebp
3063; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
3064; SKX_32-NEXT:    andl $-64, %esp
3065; SKX_32-NEXT:    subl $64, %esp
3066; SKX_32-NEXT:    vmovapd 72(%ebp), %zmm3
3067; SKX_32-NEXT:    movl 8(%ebp), %eax
3068; SKX_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
3069; SKX_32-NEXT:    vptestnmd %ymm4, %ymm4, %k1
3070; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k2
3071; SKX_32-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
3072; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
3073; SKX_32-NEXT:    vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
3074; SKX_32-NEXT:    movl %ebp, %esp
3075; SKX_32-NEXT:    popl %ebp
3076; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
3077; SKX_32-NEXT:    vzeroupper
3078; SKX_32-NEXT:    retl
3079  %sext_ind = sext <16 x i32> %ind to <16 x i64>
3080  %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
3081
3082  %mask = icmp eq <16 x i32> %cmp, zeroinitializer
3083  call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %gep.random, i32 4, <16 x i1> %mask)
3084  ret void
3085}
3086
3087; This test case previously triggered an infinite loop when the two gathers became identical after DAG combine removed the sign extend.
; Both GEPs therefore fold to the same address computation, so only a single
; vgatherdps is emitted and its result is added to itself.
3088define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %foo) {
3089; KNL_64-LABEL: test_sext_cse:
3090; KNL_64:       # %bb.0:
3091; KNL_64-NEXT:    vmovaps %zmm0, (%rsi)
3092; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
3093; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
3094; KNL_64-NEXT:    vaddps %zmm1, %zmm1, %zmm0
3095; KNL_64-NEXT:    retq
3096;
3097; KNL_32-LABEL: test_sext_cse:
3098; KNL_32:       # %bb.0:
3099; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3100; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
3101; KNL_32-NEXT:    vmovaps %zmm0, (%ecx)
3102; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
3103; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
3104; KNL_32-NEXT:    vaddps %zmm1, %zmm1, %zmm0
3105; KNL_32-NEXT:    retl
3106;
3107; SKX-LABEL: test_sext_cse:
3108; SKX:       # %bb.0:
3109; SKX-NEXT:    vmovaps %zmm0, (%rsi)
3110; SKX-NEXT:    kxnorw %k0, %k0, %k1
3111; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
3112; SKX-NEXT:    vaddps %zmm1, %zmm1, %zmm0
3113; SKX-NEXT:    retq
3114;
3115; SKX_32-LABEL: test_sext_cse:
3116; SKX_32:       # %bb.0:
3117; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3118; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
3119; SKX_32-NEXT:    vmovaps %zmm0, (%ecx)
3120; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
3121; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
3122; SKX_32-NEXT:    vaddps %zmm1, %zmm1, %zmm0
3123; SKX_32-NEXT:    retl
3124  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
3125  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
3126
3127  %sext_ind = sext <16 x i32> %ind to <16 x i64>
3128  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
3129
3130  store <16 x i32> %ind, <16 x i32>* %foo
3131  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
3132  %gep.random2 = getelementptr float, <16 x float*> %broadcast.splat, <16 x i32> %ind
3133  %res2 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random2, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
3134  %res3 = fadd <16 x float> %res2, %res
3135  ret <16 x float>%res3
3136}
3137
; A scatter whose mask is constant zero stores nothing and must fold away to a
; plain return on every target.
3138define void @zero_mask(<2 x double>%a1, <2 x double*> %ptr) {
3139; ALL-LABEL: zero_mask:
3140; ALL:       # %bb.0:
3141; ALL-NEXT:    ret{{[l|q]}}
3142  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %ptr, i32 4, <2 x i1> zeroinitializer)
3143  ret void
3144}
3145
; Constant <2 x i64> indices (0 and -2) are narrowed into a 32-bit constant
; index vector and gathered with the dword-index form (vpgatherdq); the
; zeroinitializer passthru is materialized with vpxor. In large code model the
; constant pool address is loaded via movabsq.
3146define <2 x i64> @gather_2i64_constant_indices(i64* %ptr, <2 x i1> %mask) {
3147; KNL_64-LABEL: gather_2i64_constant_indices:
3148; KNL_64:       # %bb.0:
3149; KNL_64-NEXT:    vpsllq $63, %xmm0, %xmm0
3150; KNL_64-NEXT:    vptestmq %zmm0, %zmm0, %k0
3151; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
3152; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
3153; KNL_64-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4294967294,u,u,u,u,u,u>
3154; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3155; KNL_64-NEXT:    vpgatherdq (%rdi,%ymm1,8), %zmm0 {%k1}
3156; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3157; KNL_64-NEXT:    vzeroupper
3158; KNL_64-NEXT:    retq
3159;
3160; KNL_32-LABEL: gather_2i64_constant_indices:
3161; KNL_32:       # %bb.0:
3162; KNL_32-NEXT:    vpsllq $63, %xmm0, %xmm0
3163; KNL_32-NEXT:    vptestmq %zmm0, %zmm0, %k0
3164; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
3165; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
3166; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3167; KNL_32-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,4294967294,u,u,u,u,u,u>
3168; KNL_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3169; KNL_32-NEXT:    vpgatherdq (%eax,%ymm1,8), %zmm0 {%k1}
3170; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
3171; KNL_32-NEXT:    vzeroupper
3172; KNL_32-NEXT:    retl
3173;
3174; SKX_SMALL-LABEL: gather_2i64_constant_indices:
3175; SKX_SMALL:       # %bb.0:
3176; SKX_SMALL-NEXT:    vpsllq $63, %xmm0, %xmm0
3177; SKX_SMALL-NEXT:    vpmovq2m %xmm0, %k1
3178; SKX_SMALL-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,4294967294,u,u>
3179; SKX_SMALL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3180; SKX_SMALL-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
3181; SKX_SMALL-NEXT:    retq
3182;
3183; SKX_LARGE-LABEL: gather_2i64_constant_indices:
3184; SKX_LARGE:       # %bb.0:
3185; SKX_LARGE-NEXT:    vpsllq $63, %xmm0, %xmm0
3186; SKX_LARGE-NEXT:    vpmovq2m %xmm0, %k1
3187; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
3188; SKX_LARGE-NEXT:    vmovdqa (%rax), %xmm1
3189; SKX_LARGE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3190; SKX_LARGE-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
3191; SKX_LARGE-NEXT:    retq
3192;
3193; SKX_32-LABEL: gather_2i64_constant_indices:
3194; SKX_32:       # %bb.0:
3195; SKX_32-NEXT:    vpsllq $63, %xmm0, %xmm0
3196; SKX_32-NEXT:    vpmovq2m %xmm0, %k1
3197; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3198; SKX_32-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,4294967294,u,u>
3199; SKX_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3200; SKX_32-NEXT:    vpgatherdq (%eax,%xmm1,8), %xmm0 {%k1}
3201; SKX_32-NEXT:    retl
3202  %gep = getelementptr i64, i64* %ptr, <2 x i64> <i64 0, i64 -2>
3203  %res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep, i32 8, <2 x i1> %mask, <2 x i64> zeroinitializer) #1
3204  ret <2 x i64> %res
3205}
3206
; All sixteen constant i64 indices are narrowed into one 32-bit constant index
; vector (negative values appear as their unsigned dword encodings) feeding a
; single vpgatherdd; the zeroinitializer passthru comes from vpxor. In large
; code model the constant pool address is loaded via movabsq.
3207define <16 x i32> @gather_16i64_constant_indices(i32* %ptr, <16 x i1> %mask) {
3208; KNL_64-LABEL: gather_16i64_constant_indices:
3209; KNL_64:       # %bb.0:
3210; KNL_64-NEXT:    vpmovsxbd %xmm0, %zmm0
3211; KNL_64-NEXT:    vpslld $31, %zmm0, %zmm0
3212; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k1
3213; KNL_64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
3214; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3215; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
3216; KNL_64-NEXT:    retq
3217;
3218; KNL_32-LABEL: gather_16i64_constant_indices:
3219; KNL_32:       # %bb.0:
3220; KNL_32-NEXT:    vpmovsxbd %xmm0, %zmm0
3221; KNL_32-NEXT:    vpslld $31, %zmm0, %zmm0
3222; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k1
3223; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3224; KNL_32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
3225; KNL_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3226; KNL_32-NEXT:    vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
3227; KNL_32-NEXT:    retl
3228;
3229; SKX_SMALL-LABEL: gather_16i64_constant_indices:
3230; SKX_SMALL:       # %bb.0:
3231; SKX_SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
3232; SKX_SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
3233; SKX_SMALL-NEXT:    vpmovd2m %zmm0, %k1
3234; SKX_SMALL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
3235; SKX_SMALL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3236; SKX_SMALL-NEXT:    vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
3237; SKX_SMALL-NEXT:    retq
3238;
3239; SKX_LARGE-LABEL: gather_16i64_constant_indices:
3240; SKX_LARGE:       # %bb.0:
3241; SKX_LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
3242; SKX_LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
3243; SKX_LARGE-NEXT:    vpmovd2m %zmm0, %k1
3244; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
3245; SKX_LARGE-NEXT:    vmovdqa64 (%rax), %zmm1
3246; SKX_LARGE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3247; SKX_LARGE-NEXT:    vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
3248; SKX_LARGE-NEXT:    retq
3249;
3250; SKX_32-LABEL: gather_16i64_constant_indices:
3251; SKX_32:       # %bb.0:
3252; SKX_32-NEXT:    vpmovsxbd %xmm0, %zmm0
3253; SKX_32-NEXT:    vpslld $31, %zmm0, %zmm0
3254; SKX_32-NEXT:    vpmovd2m %zmm0, %k1
3255; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3256; SKX_32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
3257; SKX_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3258; SKX_32-NEXT:    vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
3259; SKX_32-NEXT:    retl
3260  %gep = getelementptr i32, i32* %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>
3261  %res = tail call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep, i32 4, <16 x i1> %mask, <16 x i32> zeroinitializer) #1
3262  ret <16 x i32> %res
3263}
3264
; Constant <2 x i64> indices (0 and -2) are narrowed into a 32-bit constant
; index vector feeding vpscatterdd. In large code model the constant pool
; address is loaded via movabsq.
3265define void @scatter_2i64_constant_indices(i32* %ptr, <2 x i1> %mask, <2 x i32> %src0)  {
3266; KNL_64-LABEL: scatter_2i64_constant_indices:
3267; KNL_64:       # %bb.0:
3268; KNL_64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
3269; KNL_64-NEXT:    vpsllq $63, %xmm0, %xmm0
3270; KNL_64-NEXT:    vptestmq %zmm0, %zmm0, %k0
3271; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
3272; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
3273; KNL_64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,4294967294,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3274; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
3275; KNL_64-NEXT:    vzeroupper
3276; KNL_64-NEXT:    retq
3277;
3278; KNL_32-LABEL: scatter_2i64_constant_indices:
3279; KNL_32:       # %bb.0:
3280; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
3281; KNL_32-NEXT:    vpsllq $63, %xmm0, %xmm0
3282; KNL_32-NEXT:    vptestmq %zmm0, %zmm0, %k0
3283; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
3284; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
3285; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3286; KNL_32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,4294967294,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
3287; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
3288; KNL_32-NEXT:    vzeroupper
3289; KNL_32-NEXT:    retl
3290;
3291; SKX_SMALL-LABEL: scatter_2i64_constant_indices:
3292; SKX_SMALL:       # %bb.0:
3293; SKX_SMALL-NEXT:    vpsllq $63, %xmm0, %xmm0
3294; SKX_SMALL-NEXT:    vpmovq2m %xmm0, %k1
3295; SKX_SMALL-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,4294967294,u,u>
3296; SKX_SMALL-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
3297; SKX_SMALL-NEXT:    retq
3298;
3299; SKX_LARGE-LABEL: scatter_2i64_constant_indices:
3300; SKX_LARGE:       # %bb.0:
3301; SKX_LARGE-NEXT:    vpsllq $63, %xmm0, %xmm0
3302; SKX_LARGE-NEXT:    vpmovq2m %xmm0, %k1
3303; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
3304; SKX_LARGE-NEXT:    vmovdqa (%rax), %xmm0
3305; SKX_LARGE-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
3306; SKX_LARGE-NEXT:    retq
3307;
3308; SKX_32-LABEL: scatter_2i64_constant_indices:
3309; SKX_32:       # %bb.0:
3310; SKX_32-NEXT:    vpsllq $63, %xmm0, %xmm0
3311; SKX_32-NEXT:    vpmovq2m %xmm0, %k1
3312; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3313; SKX_32-NEXT:    vmovdqa {{.*#+}} xmm0 = <0,4294967294,u,u>
3314; SKX_32-NEXT:    vpscatterdd %xmm1, (%eax,%xmm0,4) {%k1}
3315; SKX_32-NEXT:    retl
3316  %gep = getelementptr i32, i32* %ptr, <2 x i64> <i64 0, i64 -2>
3317  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %src0, <2 x i32*> %gep, i32 4, <2 x i1> %mask)
3318  ret void
3319}
3320
; All sixteen constant i64 indices are narrowed into one 32-bit constant index
; vector feeding a single vpscatterdd. In large code model the constant pool
; address is loaded via movabsq.
3321define void @scatter_16i64_constant_indices(i32* %ptr, <16 x i1> %mask, <16 x i32> %src0)  {
3322; KNL_64-LABEL: scatter_16i64_constant_indices:
3323; KNL_64:       # %bb.0:
3324; KNL_64-NEXT:    vpmovsxbd %xmm0, %zmm0
3325; KNL_64-NEXT:    vpslld $31, %zmm0, %zmm0
3326; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k1
3327; KNL_64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
3328; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
3329; KNL_64-NEXT:    vzeroupper
3330; KNL_64-NEXT:    retq
3331;
3332; KNL_32-LABEL: scatter_16i64_constant_indices:
3333; KNL_32:       # %bb.0:
3334; KNL_32-NEXT:    vpmovsxbd %xmm0, %zmm0
3335; KNL_32-NEXT:    vpslld $31, %zmm0, %zmm0
3336; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k1
3337; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3338; KNL_32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
3339; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
3340; KNL_32-NEXT:    vzeroupper
3341; KNL_32-NEXT:    retl
3342;
3343; SKX_SMALL-LABEL: scatter_16i64_constant_indices:
3344; SKX_SMALL:       # %bb.0:
3345; SKX_SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
3346; SKX_SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
3347; SKX_SMALL-NEXT:    vpmovd2m %zmm0, %k1
3348; SKX_SMALL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
3349; SKX_SMALL-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
3350; SKX_SMALL-NEXT:    vzeroupper
3351; SKX_SMALL-NEXT:    retq
3352;
3353; SKX_LARGE-LABEL: scatter_16i64_constant_indices:
3354; SKX_LARGE:       # %bb.0:
3355; SKX_LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
3356; SKX_LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
3357; SKX_LARGE-NEXT:    vpmovd2m %zmm0, %k1
3358; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
3359; SKX_LARGE-NEXT:    vmovdqa64 (%rax), %zmm0
3360; SKX_LARGE-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
3361; SKX_LARGE-NEXT:    vzeroupper
3362; SKX_LARGE-NEXT:    retq
3363;
3364; SKX_32-LABEL: scatter_16i64_constant_indices:
3365; SKX_32:       # %bb.0:
3366; SKX_32-NEXT:    vpmovsxbd %xmm0, %zmm0
3367; SKX_32-NEXT:    vpslld $31, %zmm0, %zmm0
3368; SKX_32-NEXT:    vpmovd2m %zmm0, %k1
3369; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3370; SKX_32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
3371; SKX_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
3372; SKX_32-NEXT:    vzeroupper
3373; SKX_32-NEXT:    retl
3374  %gep = getelementptr i32, i32* %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>
3375  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %gep, i32 4, <16 x i1> %mask)
3376  ret void
3377}
3378
; A gather whose pointer vector is a splat of one scalar pointer should be
; lowered with that pointer as the base register and an all-zero index vector
; (the vpxor feeding the gather).
3379define <4 x i32> @splat_ptr_gather(i32* %ptr, <4 x i1> %mask, <4 x i32> %passthru) {
3380; KNL_64-LABEL: splat_ptr_gather:
3381; KNL_64:       # %bb.0:
3382; KNL_64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
3383; KNL_64-NEXT:    vpslld $31, %xmm0, %xmm0
3384; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k0
3385; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
3386; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
3387; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3388; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
3389; KNL_64-NEXT:    vmovdqa %xmm1, %xmm0
3390; KNL_64-NEXT:    vzeroupper
3391; KNL_64-NEXT:    retq
3392;
3393; KNL_32-LABEL: splat_ptr_gather:
3394; KNL_32:       # %bb.0:
3395; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
3396; KNL_32-NEXT:    vpslld $31, %xmm0, %xmm0
3397; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k0
3398; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
3399; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
3400; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3401; KNL_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3402; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
3403; KNL_32-NEXT:    vmovdqa %xmm1, %xmm0
3404; KNL_32-NEXT:    vzeroupper
3405; KNL_32-NEXT:    retl
3406;
3407; SKX-LABEL: splat_ptr_gather:
3408; SKX:       # %bb.0:
3409; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
3410; SKX-NEXT:    vpmovd2m %xmm0, %k1
3411; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3412; SKX-NEXT:    vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
3413; SKX-NEXT:    vmovdqa %xmm1, %xmm0
3414; SKX-NEXT:    retq
3415;
3416; SKX_32-LABEL: splat_ptr_gather:
3417; SKX_32:       # %bb.0:
3418; SKX_32-NEXT:    vpslld $31, %xmm0, %xmm0
3419; SKX_32-NEXT:    vpmovd2m %xmm0, %k1
3420; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3421; SKX_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3422; SKX_32-NEXT:    vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
3423; SKX_32-NEXT:    vmovdqa %xmm1, %xmm0
3424; SKX_32-NEXT:    retl
3425  %1 = insertelement <4 x i32*> undef, i32* %ptr, i32 0
3426  %2 = shufflevector <4 x i32*> %1, <4 x i32*> undef, <4 x i32> zeroinitializer
3427  %3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
3428  ret <4 x i32> %3
3429}
3430declare  <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
3431
; A scatter whose pointer vector is a splat of one scalar pointer should be
; lowered with that pointer as the base register and an all-zero index vector
; (the vpxor feeding the scatter).
3432define void @splat_ptr_scatter(i32* %ptr, <4 x i1> %mask, <4 x i32> %val) {
3433; KNL_64-LABEL: splat_ptr_scatter:
3434; KNL_64:       # %bb.0:
3435; KNL_64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
3436; KNL_64-NEXT:    vpslld $31, %xmm0, %xmm0
3437; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k0
3438; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
3439; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
3440; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3441; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
3442; KNL_64-NEXT:    vzeroupper
3443; KNL_64-NEXT:    retq
3444;
3445; KNL_32-LABEL: splat_ptr_scatter:
3446; KNL_32:       # %bb.0:
3447; KNL_32-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
3448; KNL_32-NEXT:    vpslld $31, %xmm0, %xmm0
3449; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k0
3450; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
3451; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
3452; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3453; KNL_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3454; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
3455; KNL_32-NEXT:    vzeroupper
3456; KNL_32-NEXT:    retl
3457;
3458; SKX-LABEL: splat_ptr_scatter:
3459; SKX:       # %bb.0:
3460; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
3461; SKX-NEXT:    vpmovd2m %xmm0, %k1
3462; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3463; SKX-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
3464; SKX-NEXT:    retq
3465;
3466; SKX_32-LABEL: splat_ptr_scatter:
3467; SKX_32:       # %bb.0:
3468; SKX_32-NEXT:    vpslld $31, %xmm0, %xmm0
3469; SKX_32-NEXT:    vpmovd2m %xmm0, %k1
3470; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
3471; SKX_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
3472; SKX_32-NEXT:    vpscatterdd %xmm1, (%eax,%xmm0,4) {%k1}
3473; SKX_32-NEXT:    retl
3474  %1 = insertelement <4 x i32*> undef, i32* %ptr, i32 0
3475  %2 = shufflevector <4 x i32*> %1, <4 x i32*> undef, <4 x i32> zeroinitializer
3476  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %2, i32 4, <4 x i1> %mask)
3477  ret void
3478}
3479
3480%struct.foo = type { i8*, i64, i16, i16, i32 }
3481
3482; This used to cause fast-isel to generate bad copy instructions that would
3483; cause an error in copyPhysReg.
; The gather of field 1 of %struct.foo is lowered as a vector add of the field
; offset to the pointer vector, followed by a gather with no base register.
3484define <8 x i64> @pr45906(<8 x %struct.foo*> %ptr) {
3485; KNL_64-LABEL: pr45906:
3486; KNL_64:       # %bb.0: # %bb
3487; KNL_64-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
3488; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
3489; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm0 {%k1}
3490; KNL_64-NEXT:    retq
3491;
3492; KNL_32-LABEL: pr45906:
3493; KNL_32:       # %bb.0: # %bb
3494; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
3495; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
3496; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
3497; KNL_32-NEXT:    vpgatherdq (,%ymm1), %zmm0 {%k1}
3498; KNL_32-NEXT:    retl
3499;
3500; SKX_SMALL-LABEL: pr45906:
3501; SKX_SMALL:       # %bb.0: # %bb
3502; SKX_SMALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
3503; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
3504; SKX_SMALL-NEXT:    vpgatherqq (,%zmm1), %zmm0 {%k1}
3505; SKX_SMALL-NEXT:    retq
3506;
3507; SKX_LARGE-LABEL: pr45906:
3508; SKX_LARGE:       # %bb.0: # %bb
3509; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
3510; SKX_LARGE-NEXT:    vpaddq (%rax){1to8}, %zmm0, %zmm1
3511; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
3512; SKX_LARGE-NEXT:    vpgatherqq (,%zmm1), %zmm0 {%k1}
3513; SKX_LARGE-NEXT:    retq
3514;
3515; SKX_32-LABEL: pr45906:
3516; SKX_32:       # %bb.0: # %bb
3517; SKX_32-NEXT:    vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
3518; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
3519; SKX_32-NEXT:    vpgatherdq (,%ymm1), %zmm0 {%k1}
3520; SKX_32-NEXT:    retl
3521bb:
3522  %tmp = getelementptr inbounds %struct.foo, <8 x %struct.foo*> %ptr, i64 0, i32 1
3523  %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*> %tmp, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef)
3524  ret <8 x i64> %tmp1
3525}
3526declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>)
3527