; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c

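; Throughout this file, __m256i arguments are modeled as <4 x i64> (and __m128i
; as <2 x i64>), so element-wise tests bitcast to and from the element type in
; use. The shared CHECK prefix covers both RUN lines; ret{{[l|q]}} matches both
; the 32-bit retl and the 64-bit retq forms.
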
define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %sub = sub <32 x i8> zeroinitializer, %arg
  %cmp = icmp sgt <32 x i8> %arg, zeroinitializer
  %sel = select <32 x i1> %cmp, <32 x i8> %arg, <32 x i8> %sub
  %res = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %sub = sub <16 x i16> zeroinitializer, %arg
  %cmp = icmp sgt <16 x i16> %arg, zeroinitializer
  %sel = select <16 x i1> %cmp, <16 x i16> %arg, <16 x i16> %sub
  %res = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %sub = sub <8 x i32> zeroinitializer, %arg
  %cmp = icmp sgt <8 x i32> %arg, zeroinitializer
  %sel = select <8 x i1> %cmp, <8 x i32> %arg, <8 x i32> %sub
  %res = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
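; The abs tests above use the sub/icmp/select idiom (select(x > 0, x, 0 - x))
; rather than calling llvm.x86.avx2.pabs.*, mirroring what clang emits for the
; builtins; the backend still selects vpabsb/vpabsw/vpabsd. The intrinsic
; declarations are kept even though only the idiom form is exercised here.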

define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
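; Unlike the plain adds above, the saturating adds have no simple IR idiom in
; this file and call the llvm.x86.avx2.padds.*/paddus.* intrinsics directly.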

define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
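; _mm256_alignr_epi8 is modeled as a <32 x i8> shufflevector whose mask rotates
; within each 128-bit lane, which is what vpalignr implements per lane.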

define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}
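; ANDNOT is written as xor with all-ones followed by and. Note that this
; fast-isel output materializes the all-ones constant with vpcmpeqd and emits
; vpxor+vpand rather than folding the pair into a single vpandn.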

define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %zext0 = zext <32 x i8> %arg0 to <32 x i16>
  %zext1 = zext <32 x i8> %arg1 to <32 x i16>
  %add = add <32 x i16> %zext0, %zext1
  %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <32 x i16> %lshr to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %zext0 = zext <16 x i16> %arg0 to <16 x i32>
  %zext1 = zext <16 x i16> %arg1 to <16 x i32>
  %add = add <16 x i32> %zext0, %zext1
  %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = trunc <16 x i32> %lshr to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
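; The averages are expressed as the widening rounding idiom
; trunc((zext(a) + zext(b) + 1) >> 1), which the backend recognizes and
; matches to vpavgb/vpavgw.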

define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}
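; Immediate blends are shufflevectors with a constant selection mask. Note
; that the epi32 variants select the float-domain vblendps in this output
; rather than the integer vpblendd.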

define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, <2 x i64>* %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}
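; Broadcasts are shufflevectors with an all-zero mask (a splat of element 0).
; Several of the integer forms select float-domain instructions in this output
; (vbroadcastss/vbroadcastsd/vmovddup/vinsertf128) rather than their
; vpbroadcast/vinserti128 counterparts.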

define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
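; The byte shifts are shufflevectors against a zero vector, shifting bytes
; within each 128-bit lane as vpslldq/vpsrldq do.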

define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}
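; Comparisons are modeled as icmp followed by sext, so each element becomes an
; all-ones or all-zeros mask, matching the vpcmpeq*/vpcmpgt* semantics.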

define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}
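; The sign/zero extensions are plain sext/zext; where fewer source elements
; are needed (e.g. epi8->epi32), a shufflevector first extracts the low
; subvector before extending.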

define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
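; The horizontal add/sub operations have no generic IR equivalent and call the
; llvm.x86.avx2.phadd*/phsub* intrinsics directly.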

define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i32gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i32gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
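; The gather tests all follow the same shape: a base pointer cast to i8*, an
; index vector, a mask (all-ones for the non-masked forms, with an undef
; passthrough), and an i8 scale of 2. The non-masked forms first zero the
; destination and build the all-ones mask, hence the extra vpcmpeqd/vpxor
; (or vxor*/vcmpeq*) instructions before each gather.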
786
787define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
788; X86-LABEL: test_mm_mask_i32gather_epi32:
789; X86:       # %bb.0:
790; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
791; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
792; X86-NEXT:    retl
793;
794; X64-LABEL: test_mm_mask_i32gather_epi32:
795; X64:       # %bb.0:
796; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
797; X64-NEXT:    retq
798  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
799  %arg1 = bitcast i32 *%a1 to i8*
800  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
801  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
802  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
803  %bc = bitcast <4 x i32> %call to <2 x i64>
804  ret <2 x i64> %bc
805}
806
807define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
808; X86-LABEL: test_mm256_i32gather_epi32:
809; X86:       # %bb.0:
810; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
811; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
812; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
813; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
814; X86-NEXT:    vmovdqa %ymm1, %ymm0
815; X86-NEXT:    retl
816;
817; X64-LABEL: test_mm256_i32gather_epi32:
818; X64:       # %bb.0:
819; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
820; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
821; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
822; X64-NEXT:    vmovdqa %ymm1, %ymm0
823; X64-NEXT:    retq
824  %arg0 = bitcast i32 *%a0 to i8*
825  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
826  %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
827  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
828  %bc = bitcast <8 x i32> %call to <4 x i64>
829  ret <4 x i64> %bc
830}
831declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
832
833define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
834; X86-LABEL: test_mm256_mask_i32gather_epi32:
835; X86:       # %bb.0:
836; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
837; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
838; X86-NEXT:    retl
839;
840; X64-LABEL: test_mm256_mask_i32gather_epi32:
841; X64:       # %bb.0:
842; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
843; X64-NEXT:    retq
844  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
845  %arg1 = bitcast i32 *%a1 to i8*
846  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
847  %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
848  %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
849  %bc = bitcast <8 x i32> %call to <4 x i64>
850  ret <4 x i64> %bc
851}
852
853define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
854; X86-LABEL: test_mm_i32gather_epi64:
855; X86:       # %bb.0:
856; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
857; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
858; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
859; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
860; X86-NEXT:    vmovdqa %xmm1, %xmm0
861; X86-NEXT:    retl
862;
863; X64-LABEL: test_mm_i32gather_epi64:
864; X64:       # %bb.0:
865; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
866; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
867; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
868; X64-NEXT:    vmovdqa %xmm1, %xmm0
869; X64-NEXT:    retq
870  %arg0 = bitcast i64 *%a0 to i8*
871  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
872  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
873  ret <2 x i64> %res
874}
875declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
876
877define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
878; X86-LABEL: test_mm_mask_i32gather_epi64:
879; X86:       # %bb.0:
880; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
881; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
882; X86-NEXT:    retl
883;
884; X64-LABEL: test_mm_mask_i32gather_epi64:
885; X64:       # %bb.0:
886; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
887; X64-NEXT:    retq
888  %arg1 = bitcast i64 *%a1 to i8*
889  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
890  %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
891  ret <2 x i64> %res
892}
893
894define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
895; X86-LABEL: test_mm256_i32gather_epi64:
896; X86:       # %bb.0:
897; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
898; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
899; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
900; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
901; X86-NEXT:    vmovdqa %ymm1, %ymm0
902; X86-NEXT:    retl
903;
904; X64-LABEL: test_mm256_i32gather_epi64:
905; X64:       # %bb.0:
906; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
907; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
908; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
909; X64-NEXT:    vmovdqa %ymm1, %ymm0
910; X64-NEXT:    retq
911  %arg0 = bitcast i64 *%a0 to i8*
912  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
913  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
914  ret <4 x i64> %res
915}
916declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
917
918define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
919; X86-LABEL: test_mm256_mask_i32gather_epi64:
920; X86:       # %bb.0:
921; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
922; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
923; X86-NEXT:    retl
924;
925; X64-LABEL: test_mm256_mask_i32gather_epi64:
926; X64:       # %bb.0:
927; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
928; X64-NEXT:    retq
929  %arg1 = bitcast i64 *%a1 to i8*
930  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
931  %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
932  ret <4 x i64> %res
933}
934
935define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
936; X86-LABEL: test_mm_i32gather_pd:
937; X86:       # %bb.0:
938; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
939; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
940; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
941; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
942; X86-NEXT:    vmovapd %xmm1, %xmm0
943; X86-NEXT:    retl
944;
945; X64-LABEL: test_mm_i32gather_pd:
946; X64:       # %bb.0:
947; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
948; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
949; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
950; X64-NEXT:    vmovapd %xmm1, %xmm0
951; X64-NEXT:    retq
952  %arg0 = bitcast double *%a0 to i8*
953  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
954  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
955  %sext = sext <2 x i1> %cmp to <2 x i64>
956  %mask = bitcast <2 x i64> %sext to <2 x double>
957  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
958  ret <2 x double> %res
959}
960declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
961
962define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
963; X86-LABEL: test_mm_mask_i32gather_pd:
964; X86:       # %bb.0:
965; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
966; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
967; X86-NEXT:    retl
968;
969; X64-LABEL: test_mm_mask_i32gather_pd:
970; X64:       # %bb.0:
971; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
972; X64-NEXT:    retq
973  %arg1 = bitcast double *%a1 to i8*
974  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
975  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
976  ret <2 x double> %res
977}
978
979define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
980; X86-LABEL: test_mm256_i32gather_pd:
981; X86:       # %bb.0:
982; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
983; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
984; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
985; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
986; X86-NEXT:    vmovapd %ymm1, %ymm0
987; X86-NEXT:    retl
988;
989; X64-LABEL: test_mm256_i32gather_pd:
990; X64:       # %bb.0:
991; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
992; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
993; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
994; X64-NEXT:    vmovapd %ymm1, %ymm0
995; X64-NEXT:    retq
996  %arg0 = bitcast double *%a0 to i8*
997  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
998  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
999  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
1000  ret <4 x double> %res
1001}
1002declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
1003
1004define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
1005; X86-LABEL: test_mm256_mask_i32gather_pd:
1006; X86:       # %bb.0:
1007; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1008; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
1009; X86-NEXT:    retl
1010;
1011; X64-LABEL: test_mm256_mask_i32gather_pd:
1012; X64:       # %bb.0:
1013; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1014; X64-NEXT:    retq
1015  %arg1 = bitcast double *%a1 to i8*
1016  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1017  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
1018  ret <4 x double> %res
1019}
1020
1021define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
1022; X86-LABEL: test_mm_i32gather_ps:
1023; X86:       # %bb.0:
1024; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1025; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1026; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1027; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1028; X86-NEXT:    vmovaps %xmm1, %xmm0
1029; X86-NEXT:    retl
1030;
1031; X64-LABEL: test_mm_i32gather_ps:
1032; X64:       # %bb.0:
1033; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1034; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1035; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1036; X64-NEXT:    vmovaps %xmm1, %xmm0
1037; X64-NEXT:    retq
1038  %arg0 = bitcast float *%a0 to i8*
1039  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1040  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1041  %sext = sext <4 x i1> %cmp to <4 x i32>
1042  %mask = bitcast <4 x i32> %sext to <4 x float>
1043  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
1044  ret <4 x float> %call
1045}
1046declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
1047
1048define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1049; X86-LABEL: test_mm_mask_i32gather_ps:
1050; X86:       # %bb.0:
1051; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1052; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1053; X86-NEXT:    retl
1054;
1055; X64-LABEL: test_mm_mask_i32gather_ps:
1056; X64:       # %bb.0:
1057; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1058; X64-NEXT:    retq
1059  %arg1 = bitcast float *%a1 to i8*
1060  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1061  %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
1062  ret <4 x float> %call
1063}
1064
1065define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
1066; X86-LABEL: test_mm256_i32gather_ps:
1067; X86:       # %bb.0:
1068; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1069; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1070; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1071; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1072; X86-NEXT:    vmovaps %ymm1, %ymm0
1073; X86-NEXT:    retl
1074;
1075; X64-LABEL: test_mm256_i32gather_ps:
1076; X64:       # %bb.0:
1077; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1078; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1079; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1080; X64-NEXT:    vmovaps %ymm1, %ymm0
1081; X64-NEXT:    retq
1082  %arg0 = bitcast float *%a0 to i8*
1083  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1084  %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
1085  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
1086  ret <8 x float> %call
1087}
1088declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
1089
1090define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
1091; X86-LABEL: test_mm256_mask_i32gather_ps:
1092; X86:       # %bb.0:
1093; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1094; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1095; X86-NEXT:    retl
1096;
1097; X64-LABEL: test_mm256_mask_i32gather_ps:
1098; X64:       # %bb.0:
1099; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1100; X64-NEXT:    retq
1101  %arg1 = bitcast float *%a1 to i8*
1102  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1103  %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
1104  ret <8 x float> %call
1105}
1106
1107define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
1108; X86-LABEL: test_mm_i64gather_epi32:
1109; X86:       # %bb.0:
1110; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1111; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1112; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1113; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1114; X86-NEXT:    vmovdqa %xmm1, %xmm0
1115; X86-NEXT:    retl
1116;
1117; X64-LABEL: test_mm_i64gather_epi32:
1118; X64:       # %bb.0:
1119; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1120; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1121; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1122; X64-NEXT:    vmovdqa %xmm1, %xmm0
1123; X64-NEXT:    retq
1124  %arg0 = bitcast i32 *%a0 to i8*
1125  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1126  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1127  %bc = bitcast <4 x i32> %call to <2 x i64>
1128  ret <2 x i64> %bc
1129}
1130declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
1131
1132define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1133; X86-LABEL: test_mm_mask_i64gather_epi32:
1134; X86:       # %bb.0:
1135; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1136; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1137; X86-NEXT:    retl
1138;
1139; X64-LABEL: test_mm_mask_i64gather_epi32:
1140; X64:       # %bb.0:
1141; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1142; X64-NEXT:    retq
1143  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1144  %arg1 = bitcast i32 *%a1 to i8*
1145  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1146  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1147  %bc = bitcast <4 x i32> %call to <2 x i64>
1148  ret <2 x i64> %bc
1149}
1150
1151define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
1152; X86-LABEL: test_mm256_i64gather_epi32:
1153; X86:       # %bb.0:
1154; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1155; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1156; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1157; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1158; X86-NEXT:    vmovdqa %xmm1, %xmm0
1159; X86-NEXT:    vzeroupper
1160; X86-NEXT:    retl
1161;
1162; X64-LABEL: test_mm256_i64gather_epi32:
1163; X64:       # %bb.0:
1164; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1165; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1166; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1167; X64-NEXT:    vmovdqa %xmm1, %xmm0
1168; X64-NEXT:    vzeroupper
1169; X64-NEXT:    retq
1170  %arg0 = bitcast i32 *%a0 to i8*
1171  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1172  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1173  %bc = bitcast <4 x i32> %call to <2 x i64>
1174  ret <2 x i64> %bc
1175}
1176declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
1177
1178define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
1179; X86-LABEL: test_mm256_mask_i64gather_epi32:
1180; X86:       # %bb.0:
1181; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1182; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1183; X86-NEXT:    vzeroupper
1184; X86-NEXT:    retl
1185;
1186; X64-LABEL: test_mm256_mask_i64gather_epi32:
1187; X64:       # %bb.0:
1188; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1189; X64-NEXT:    vzeroupper
1190; X64-NEXT:    retq
1191  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1192  %arg1 = bitcast i32 *%a1 to i8*
1193  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1194  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1195  %bc = bitcast <4 x i32> %call to <2 x i64>
1196  ret <2 x i64> %bc
1197}
1198
1199define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
1200; X86-LABEL: test_mm_i64gather_epi64:
1201; X86:       # %bb.0:
1202; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1203; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1204; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1205; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1206; X86-NEXT:    vmovdqa %xmm1, %xmm0
1207; X86-NEXT:    retl
1208;
1209; X64-LABEL: test_mm_i64gather_epi64:
1210; X64:       # %bb.0:
1211; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1212; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1213; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1214; X64-NEXT:    vmovdqa %xmm1, %xmm0
1215; X64-NEXT:    retq
1216  %arg0 = bitcast i64 *%a0 to i8*
1217  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1218  ret <2 x i64> %call
1219}
1220declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
1221
1222define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1223; X86-LABEL: test_mm_mask_i64gather_epi64:
1224; X86:       # %bb.0:
1225; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1226; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1227; X86-NEXT:    retl
1228;
1229; X64-LABEL: test_mm_mask_i64gather_epi64:
1230; X64:       # %bb.0:
1231; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1232; X64-NEXT:    retq
1233  %arg1 = bitcast i64 *%a1 to i8*
1234  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1235  ret <2 x i64> %call
1236}
1237
1238define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
1239; X86-LABEL: test_mm256_i64gather_epi64:
1240; X86:       # %bb.0:
1241; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1242; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1243; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1244; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1245; X86-NEXT:    vmovdqa %ymm1, %ymm0
1246; X86-NEXT:    retl
1247;
1248; X64-LABEL: test_mm256_i64gather_epi64:
1249; X64:       # %bb.0:
1250; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1251; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1252; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1253; X64-NEXT:    vmovdqa %ymm1, %ymm0
1254; X64-NEXT:    retq
1255  %arg0 = bitcast i64 *%a0 to i8*
1256  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1257  ret <4 x i64> %call
1258}
1259declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
1260
1261define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
1262; X86-LABEL: test_mm256_mask_i64gather_epi64:
1263; X86:       # %bb.0:
1264; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1265; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1266; X86-NEXT:    retl
1267;
1268; X64-LABEL: test_mm256_mask_i64gather_epi64:
1269; X64:       # %bb.0:
1270; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1271; X64-NEXT:    retq
1272  %arg1 = bitcast i64 *%a1 to i8*
1273  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1274  ret <4 x i64> %call
1275}
1276
define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
  ret <2 x double> %call
}
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
  ret <2 x double> %call
}

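; The 256-bit pd variant instead gets its all-ones mask from the
; avx.cmp.pd.256 intrinsic with predicate 0 (EQ_OQ) on zero vectors, which llc
; lowers to the vxorpd/vcmpeqpd pair seen below.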
define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
  ret <4 x double> %call
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, double *%a1, <4 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}

define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

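; _mm256_inserti128_si256 is modelled as two shufflevectors: the first widens
; the 128-bit source (high half undef), the second picks which 128-bit lane it
; replaces. Inserting into the low lane selects a blend; into the high lane,
; a vinsertf128.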
define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test0_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test1_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_madd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_maddubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly

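; Maskstore writes only the elements whose mask lane has its sign bit set; the
; 256-bit variants end in vzeroupper to avoid AVX/SSE transition penalties.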
define void @test_mm_maskstore_epi32(i32* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(i32* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ugt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ugt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ugt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp slt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp slt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp slt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ult <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ult <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ult <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovmskb %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

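; _mm256_mul_epi32 is expressed as a shl-by-32/ashr-by-32 pair on each qword
; (sign-extending the low dword in place) followed by an nsw multiply; the
; backend recognizes the pattern and selects vpmuldq.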
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $32, %ymm0, %ymm0
; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm2
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; CHECK-NEXT:    vpsllq $32, %ymm1, %ymm1
; CHECK-NEXT:    vpsrad $31, %ymm1, %ymm2
; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

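; The unsigned counterpart masks each qword with 0xffffffff (zero-extending the
; low dword) and multiplies nuw, which matches vpmuludq.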
define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

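; The shufflevector mask <2, 3, 6, 7> selects the high 128-bit lane of each
; source, i.e. _mm256_permute2x128_si256(a, b, 0x31).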
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

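; shufflehi/shufflelo operate per 128-bit lane, so the 16-element masks repeat
; the same 4-element pattern in each half (indices 8-15 mirror 0-7).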
define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

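; Byte shifts work per 128-bit lane: for a left shift by 3, mask indices 13-15
; and 29-31 select from the zero vector, placing zeros in the low three bytes
; of each lane.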
define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sllv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sra_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srai_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone

define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srav_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

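; The right-shift analogue: mask indices 32-34 and 48-50 select from the zero
; vector, placing zeros in the top three bytes of each 128-bit lane.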
define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

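; _mm256_stream_load_si256 maps to the movntdqa intrinsic, a non-temporal-hint
; load that requires a 32-byte-aligned source.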
2463define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
2464; X86-LABEL: test_mm256_stream_load_si256:
2465; X86:       # %bb.0:
2466; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2467; X86-NEXT:    vmovntdqa (%eax), %ymm0
2468; X86-NEXT:    retl
2469;
2470; X64-LABEL: test_mm256_stream_load_si256:
2471; X64:       # %bb.0:
2472; X64-NEXT:    vmovntdqa (%rdi), %ymm0
2473; X64-NEXT:    retq
2474  %arg0 = bitcast <4 x i64> *%a0 to i8*
2475  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
2476  ret <4 x i64> %res
2477}
2478declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
2479
2480define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2481; CHECK-LABEL: test_mm256_sub_epi8:
2482; CHECK:       # %bb.0:
2483; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
2484; CHECK-NEXT:    ret{{[l|q]}}
2485  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2486  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2487  %res = sub <32 x i8> %arg0, %arg1
2488  %bc = bitcast <32 x i8> %res to <4 x i64>
2489  ret <4 x i64> %bc
2490}
2491
2492define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2493; CHECK-LABEL: test_mm256_sub_epi16:
2494; CHECK:       # %bb.0:
2495; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
2496; CHECK-NEXT:    ret{{[l|q]}}
2497  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2498  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2499  %res = sub <16 x i16> %arg0, %arg1
2500  %bc = bitcast <16 x i16> %res to <4 x i64>
2501  ret <4 x i64> %bc
2502}
2503
2504define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2505; CHECK-LABEL: test_mm256_sub_epi32:
2506; CHECK:       # %bb.0:
2507; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
2508; CHECK-NEXT:    ret{{[l|q]}}
2509  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2510  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2511  %res = sub <8 x i32> %arg0, %arg1
2512  %bc = bitcast <8 x i32> %res to <4 x i64>
2513  ret <4 x i64> %bc
2514}
2515
2516define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2517; CHECK-LABEL: test_mm256_sub_epi64:
2518; CHECK:       # %bb.0:
2519; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
2520; CHECK-NEXT:    ret{{[l|q]}}
2521  %res = sub <4 x i64> %a0, %a1
2522  ret <4 x i64> %res
2523}
2524
2525define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2526; CHECK-LABEL: test_mm256_subs_epi8:
2527; CHECK:       # %bb.0:
2528; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
2529; CHECK-NEXT:    ret{{[l|q]}}
2530  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2531  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2532  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
2533  %bc = bitcast <32 x i8> %res to <4 x i64>
2534  ret <4 x i64> %bc
2535}
2536declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
2537
2538define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2539; CHECK-LABEL: test_mm256_subs_epi16:
2540; CHECK:       # %bb.0:
2541; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
2542; CHECK-NEXT:    ret{{[l|q]}}
2543  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2544  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2545  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
2546  %bc = bitcast <16 x i16> %res to <4 x i64>
2547  ret <4 x i64> %bc
2548}
2549declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
2550
2551define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2552; CHECK-LABEL: test_mm256_subs_epu8:
2553; CHECK:       # %bb.0:
2554; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
2555; CHECK-NEXT:    ret{{[l|q]}}
2556  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2557  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2558  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
2559  %bc = bitcast <32 x i8> %res to <4 x i64>
2560  ret <4 x i64> %bc
2561}
2562declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
2563
2564define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
2565; CHECK-LABEL: test_mm256_subs_epu16:
2566; CHECK:       # %bb.0:
2567; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
2568; CHECK-NEXT:    ret{{[l|q]}}
2569  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2570  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2571  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
2572  %bc = bitcast <16 x i16> %res to <4 x i64>
2573  ret <4 x i64> %bc
2574}
2575declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
2576
define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

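; For 32- and 64-bit elements the compiler selects the FP-domain forms
; (vunpckhps/vunpckhpd here, and vunpcklps/vunpcklpd further below) rather
; than vpunpckhdq/vpunpckhqdq; the two encodings produce identical bit
; patterns, so only the execution domain differs.
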
define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}

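; Bitwise ops are domain-agnostic, so the integer xor is emitted here as the
; FP-domain vxorps rather than vpxor; the result is bit-identical.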
define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

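; Declarations for the AVX comparison intrinsics; nothing in this section
; calls them, so they appear to be leftovers from the builtin test input.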
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone