• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
4
5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlvbmi2-builtins.c
6
7define <2 x i64> @test_mm_mask_compress_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__D) {
8; X86-LABEL: test_mm_mask_compress_epi16:
9; X86:       # %bb.0: # %entry
10; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
11; X86-NEXT:    kmovd %eax, %k1
12; X86-NEXT:    vpcompressw %xmm1, %xmm0 {%k1}
13; X86-NEXT:    retl
14;
15; X64-LABEL: test_mm_mask_compress_epi16:
16; X64:       # %bb.0: # %entry
17; X64-NEXT:    kmovd %edi, %k1
18; X64-NEXT:    vpcompressw %xmm1, %xmm0 {%k1}
19; X64-NEXT:    retq
20entry:
21  %0 = bitcast <2 x i64> %__D to <8 x i16>
22  %1 = bitcast <2 x i64> %__S to <8 x i16>
23  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %0, <8 x i16> %1, i8 %__U)
24  %3 = bitcast <8 x i16> %2 to <2 x i64>
25  ret <2 x i64> %3
26}
27
28define <2 x i64> @test_mm_maskz_compress_epi16(i8 zeroext %__U, <2 x i64> %__D) {
29; X86-LABEL: test_mm_maskz_compress_epi16:
30; X86:       # %bb.0: # %entry
31; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
32; X86-NEXT:    kmovd %eax, %k1
33; X86-NEXT:    vpcompressw %xmm0, %xmm0 {%k1} {z}
34; X86-NEXT:    retl
35;
36; X64-LABEL: test_mm_maskz_compress_epi16:
37; X64:       # %bb.0: # %entry
38; X64-NEXT:    kmovd %edi, %k1
39; X64-NEXT:    vpcompressw %xmm0, %xmm0 {%k1} {z}
40; X64-NEXT:    retq
41entry:
42  %0 = bitcast <2 x i64> %__D to <8 x i16>
43  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %0, <8 x i16> zeroinitializer, i8 %__U)
44  %2 = bitcast <8 x i16> %1 to <2 x i64>
45  ret <2 x i64> %2
46}
47
48define <2 x i64> @test_mm_mask_compress_epi8(<2 x i64> %__S, i16 zeroext %__U, <2 x i64> %__D) {
49; X86-LABEL: test_mm_mask_compress_epi8:
50; X86:       # %bb.0: # %entry
51; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
52; X86-NEXT:    vpcompressb %xmm1, %xmm0 {%k1}
53; X86-NEXT:    retl
54;
55; X64-LABEL: test_mm_mask_compress_epi8:
56; X64:       # %bb.0: # %entry
57; X64-NEXT:    kmovd %edi, %k1
58; X64-NEXT:    vpcompressb %xmm1, %xmm0 {%k1}
59; X64-NEXT:    retq
60entry:
61  %0 = bitcast <2 x i64> %__D to <16 x i8>
62  %1 = bitcast <2 x i64> %__S to <16 x i8>
63  %2 = tail call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %0, <16 x i8> %1, i16 %__U)
64  %3 = bitcast <16 x i8> %2 to <2 x i64>
65  ret <2 x i64> %3
66}
67
68define <2 x i64> @test_mm_maskz_compress_epi8(i16 zeroext %__U, <2 x i64> %__D) {
69; X86-LABEL: test_mm_maskz_compress_epi8:
70; X86:       # %bb.0: # %entry
71; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
72; X86-NEXT:    vpcompressb %xmm0, %xmm0 {%k1} {z}
73; X86-NEXT:    retl
74;
75; X64-LABEL: test_mm_maskz_compress_epi8:
76; X64:       # %bb.0: # %entry
77; X64-NEXT:    kmovd %edi, %k1
78; X64-NEXT:    vpcompressb %xmm0, %xmm0 {%k1} {z}
79; X64-NEXT:    retq
80entry:
81  %0 = bitcast <2 x i64> %__D to <16 x i8>
82  %1 = tail call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %0, <16 x i8> zeroinitializer, i16 %__U)
83  %2 = bitcast <16 x i8> %1 to <2 x i64>
84  ret <2 x i64> %2
85}
86
87define void @test_mm_mask_compressstoreu_epi16(i8* %__P, i8 zeroext %__U, <2 x i64> %__D) {
88; X86-LABEL: test_mm_mask_compressstoreu_epi16:
89; X86:       # %bb.0: # %entry
90; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
91; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
92; X86-NEXT:    kmovd %eax, %k1
93; X86-NEXT:    vpcompressw %xmm0, (%ecx) {%k1}
94; X86-NEXT:    retl
95;
96; X64-LABEL: test_mm_mask_compressstoreu_epi16:
97; X64:       # %bb.0: # %entry
98; X64-NEXT:    kmovd %esi, %k1
99; X64-NEXT:    vpcompressw %xmm0, (%rdi) {%k1}
100; X64-NEXT:    retq
101entry:
102  %0 = bitcast <2 x i64> %__D to <8 x i16>
103  %1 = bitcast i8* %__P to i16*
104  %2 = bitcast i8 %__U to <8 x i1>
105  tail call void @llvm.masked.compressstore.v8i16(<8 x i16> %0, i16* %1, <8 x i1> %2)
106  ret void
107}
108
109define void @test_mm_mask_compressstoreu_epi8(i8* %__P, i16 zeroext %__U, <2 x i64> %__D) {
110; X86-LABEL: test_mm_mask_compressstoreu_epi8:
111; X86:       # %bb.0: # %entry
112; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
113; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
114; X86-NEXT:    vpcompressb %xmm0, (%eax) {%k1}
115; X86-NEXT:    retl
116;
117; X64-LABEL: test_mm_mask_compressstoreu_epi8:
118; X64:       # %bb.0: # %entry
119; X64-NEXT:    kmovd %esi, %k1
120; X64-NEXT:    vpcompressb %xmm0, (%rdi) {%k1}
121; X64-NEXT:    retq
122entry:
123  %0 = bitcast <2 x i64> %__D to <16 x i8>
124  %1 = bitcast i16 %__U to <16 x i1>
125  tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %0, i8* %__P, <16 x i1> %1)
126  ret void
127}
128
129define <2 x i64> @test_mm_mask_expand_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__D) {
130; X86-LABEL: test_mm_mask_expand_epi16:
131; X86:       # %bb.0: # %entry
132; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
133; X86-NEXT:    kmovd %eax, %k1
134; X86-NEXT:    vpexpandw %xmm1, %xmm0 {%k1}
135; X86-NEXT:    retl
136;
137; X64-LABEL: test_mm_mask_expand_epi16:
138; X64:       # %bb.0: # %entry
139; X64-NEXT:    kmovd %edi, %k1
140; X64-NEXT:    vpexpandw %xmm1, %xmm0 {%k1}
141; X64-NEXT:    retq
142entry:
143  %0 = bitcast <2 x i64> %__D to <8 x i16>
144  %1 = bitcast <2 x i64> %__S to <8 x i16>
145  %2 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %0, <8 x i16> %1, i8 %__U)
146  %3 = bitcast <8 x i16> %2 to <2 x i64>
147  ret <2 x i64> %3
148}
149
150define <2 x i64> @test_mm_maskz_expand_epi16(i8 zeroext %__U, <2 x i64> %__D) {
151; X86-LABEL: test_mm_maskz_expand_epi16:
152; X86:       # %bb.0: # %entry
153; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
154; X86-NEXT:    kmovd %eax, %k1
155; X86-NEXT:    vpexpandw %xmm0, %xmm0 {%k1} {z}
156; X86-NEXT:    retl
157;
158; X64-LABEL: test_mm_maskz_expand_epi16:
159; X64:       # %bb.0: # %entry
160; X64-NEXT:    kmovd %edi, %k1
161; X64-NEXT:    vpexpandw %xmm0, %xmm0 {%k1} {z}
162; X64-NEXT:    retq
163entry:
164  %0 = bitcast <2 x i64> %__D to <8 x i16>
165  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %0, <8 x i16> zeroinitializer, i8 %__U)
166  %2 = bitcast <8 x i16> %1 to <2 x i64>
167  ret <2 x i64> %2
168}
169
170define <2 x i64> @test_mm_mask_expand_epi8(<2 x i64> %__S, i16 zeroext %__U, <2 x i64> %__D) {
171; X86-LABEL: test_mm_mask_expand_epi8:
172; X86:       # %bb.0: # %entry
173; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
174; X86-NEXT:    vpexpandb %xmm1, %xmm0 {%k1}
175; X86-NEXT:    retl
176;
177; X64-LABEL: test_mm_mask_expand_epi8:
178; X64:       # %bb.0: # %entry
179; X64-NEXT:    kmovd %edi, %k1
180; X64-NEXT:    vpexpandb %xmm1, %xmm0 {%k1}
181; X64-NEXT:    retq
182entry:
183  %0 = bitcast <2 x i64> %__D to <16 x i8>
184  %1 = bitcast <2 x i64> %__S to <16 x i8>
185  %2 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %0, <16 x i8> %1, i16 %__U)
186  %3 = bitcast <16 x i8> %2 to <2 x i64>
187  ret <2 x i64> %3
188}
189
190define <2 x i64> @test_mm_maskz_expand_epi8(i16 zeroext %__U, <2 x i64> %__D) {
191; X86-LABEL: test_mm_maskz_expand_epi8:
192; X86:       # %bb.0: # %entry
193; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
194; X86-NEXT:    vpexpandb %xmm0, %xmm0 {%k1} {z}
195; X86-NEXT:    retl
196;
197; X64-LABEL: test_mm_maskz_expand_epi8:
198; X64:       # %bb.0: # %entry
199; X64-NEXT:    kmovd %edi, %k1
200; X64-NEXT:    vpexpandb %xmm0, %xmm0 {%k1} {z}
201; X64-NEXT:    retq
202entry:
203  %0 = bitcast <2 x i64> %__D to <16 x i8>
204  %1 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %0, <16 x i8> zeroinitializer, i16 %__U)
205  %2 = bitcast <16 x i8> %1 to <2 x i64>
206  ret <2 x i64> %2
207}
208
209define <2 x i64> @test_mm_mask_expandloadu_epi16(<2 x i64> %__S, i8 zeroext %__U, i8* readonly %__P) {
210; X86-LABEL: test_mm_mask_expandloadu_epi16:
211; X86:       # %bb.0: # %entry
212; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
213; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
214; X86-NEXT:    kmovd %ecx, %k1
215; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1}
216; X86-NEXT:    retl
217;
218; X64-LABEL: test_mm_mask_expandloadu_epi16:
219; X64:       # %bb.0: # %entry
220; X64-NEXT:    kmovd %edi, %k1
221; X64-NEXT:    vpexpandw (%rsi), %xmm0 {%k1}
222; X64-NEXT:    retq
223entry:
224  %0 = bitcast <2 x i64> %__S to <8 x i16>
225  %1 = bitcast i8* %__P to i16*
226  %2 = bitcast i8 %__U to <8 x i1>
227  %3 = tail call <8 x i16> @llvm.masked.expandload.v8i16(i16* %1, <8 x i1> %2, <8 x i16> %0)
228  %4 = bitcast <8 x i16> %3 to <2 x i64>
229  ret <2 x i64> %4
230}
231
232define <2 x i64> @test_mm_maskz_expandloadu_epi16(i8 zeroext %__U, i8* readonly %__P) {
233; X86-LABEL: test_mm_maskz_expandloadu_epi16:
234; X86:       # %bb.0: # %entry
235; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
236; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
237; X86-NEXT:    kmovd %ecx, %k1
238; X86-NEXT:    vpexpandw (%eax), %xmm0 {%k1} {z}
239; X86-NEXT:    retl
240;
241; X64-LABEL: test_mm_maskz_expandloadu_epi16:
242; X64:       # %bb.0: # %entry
243; X64-NEXT:    kmovd %edi, %k1
244; X64-NEXT:    vpexpandw (%rsi), %xmm0 {%k1} {z}
245; X64-NEXT:    retq
246entry:
247  %0 = bitcast i8* %__P to i16*
248  %1 = bitcast i8 %__U to <8 x i1>
249  %2 = tail call <8 x i16> @llvm.masked.expandload.v8i16(i16* %0, <8 x i1> %1, <8 x i16> zeroinitializer)
250  %3 = bitcast <8 x i16> %2 to <2 x i64>
251  ret <2 x i64> %3
252}
253
254define <2 x i64> @test_mm_mask_expandloadu_epi8(<2 x i64> %__S, i16 zeroext %__U, i8* readonly %__P) {
255; X86-LABEL: test_mm_mask_expandloadu_epi8:
256; X86:       # %bb.0: # %entry
257; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
258; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
259; X86-NEXT:    vpexpandb (%eax), %xmm0 {%k1}
260; X86-NEXT:    retl
261;
262; X64-LABEL: test_mm_mask_expandloadu_epi8:
263; X64:       # %bb.0: # %entry
264; X64-NEXT:    kmovd %edi, %k1
265; X64-NEXT:    vpexpandb (%rsi), %xmm0 {%k1}
266; X64-NEXT:    retq
267entry:
268  %0 = bitcast <2 x i64> %__S to <16 x i8>
269  %1 = bitcast i16 %__U to <16 x i1>
270  %2 = tail call <16 x i8> @llvm.masked.expandload.v16i8(i8* %__P, <16 x i1> %1, <16 x i8> %0)
271  %3 = bitcast <16 x i8> %2 to <2 x i64>
272  ret <2 x i64> %3
273}
274
275define <2 x i64> @test_mm_maskz_expandloadu_epi8(i16 zeroext %__U, i8* readonly %__P) {
276; X86-LABEL: test_mm_maskz_expandloadu_epi8:
277; X86:       # %bb.0: # %entry
278; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
279; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
280; X86-NEXT:    vpexpandb (%eax), %xmm0 {%k1} {z}
281; X86-NEXT:    retl
282;
283; X64-LABEL: test_mm_maskz_expandloadu_epi8:
284; X64:       # %bb.0: # %entry
285; X64-NEXT:    kmovd %edi, %k1
286; X64-NEXT:    vpexpandb (%rsi), %xmm0 {%k1} {z}
287; X64-NEXT:    retq
288entry:
289  %0 = bitcast i16 %__U to <16 x i1>
290  %1 = tail call <16 x i8> @llvm.masked.expandload.v16i8(i8* %__P, <16 x i1> %0, <16 x i8> zeroinitializer)
291  %2 = bitcast <16 x i8> %1 to <2 x i64>
292  ret <2 x i64> %2
293}
294
295define <4 x i64> @test_mm256_mask_compress_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__D) {
296; X86-LABEL: test_mm256_mask_compress_epi16:
297; X86:       # %bb.0: # %entry
298; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
299; X86-NEXT:    vpcompressw %ymm1, %ymm0 {%k1}
300; X86-NEXT:    retl
301;
302; X64-LABEL: test_mm256_mask_compress_epi16:
303; X64:       # %bb.0: # %entry
304; X64-NEXT:    kmovd %edi, %k1
305; X64-NEXT:    vpcompressw %ymm1, %ymm0 {%k1}
306; X64-NEXT:    retq
307entry:
308  %0 = bitcast <4 x i64> %__D to <16 x i16>
309  %1 = bitcast <4 x i64> %__S to <16 x i16>
310  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %0, <16 x i16> %1, i16 %__U)
311  %3 = bitcast <16 x i16> %2 to <4 x i64>
312  ret <4 x i64> %3
313}
314
315define <4 x i64> @test_mm256_maskz_compress_epi16(i16 zeroext %__U, <4 x i64> %__D) {
316; X86-LABEL: test_mm256_maskz_compress_epi16:
317; X86:       # %bb.0: # %entry
318; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
319; X86-NEXT:    vpcompressw %ymm0, %ymm0 {%k1} {z}
320; X86-NEXT:    retl
321;
322; X64-LABEL: test_mm256_maskz_compress_epi16:
323; X64:       # %bb.0: # %entry
324; X64-NEXT:    kmovd %edi, %k1
325; X64-NEXT:    vpcompressw %ymm0, %ymm0 {%k1} {z}
326; X64-NEXT:    retq
327entry:
328  %0 = bitcast <4 x i64> %__D to <16 x i16>
329  %1 = tail call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %0, <16 x i16> zeroinitializer, i16 %__U)
330  %2 = bitcast <16 x i16> %1 to <4 x i64>
331  ret <4 x i64> %2
332}
333
334define <4 x i64> @test_mm256_mask_compress_epi8(<4 x i64> %__S, i32 %__U, <4 x i64> %__D) {
335; X86-LABEL: test_mm256_mask_compress_epi8:
336; X86:       # %bb.0: # %entry
337; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
338; X86-NEXT:    vpcompressb %ymm1, %ymm0 {%k1}
339; X86-NEXT:    retl
340;
341; X64-LABEL: test_mm256_mask_compress_epi8:
342; X64:       # %bb.0: # %entry
343; X64-NEXT:    kmovd %edi, %k1
344; X64-NEXT:    vpcompressb %ymm1, %ymm0 {%k1}
345; X64-NEXT:    retq
346entry:
347  %0 = bitcast <4 x i64> %__D to <32 x i8>
348  %1 = bitcast <4 x i64> %__S to <32 x i8>
349  %2 = tail call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %0, <32 x i8> %1, i32 %__U)
350  %3 = bitcast <32 x i8> %2 to <4 x i64>
351  ret <4 x i64> %3
352}
353
354define <4 x i64> @test_mm256_maskz_compress_epi8(i32 %__U, <4 x i64> %__D) {
355; X86-LABEL: test_mm256_maskz_compress_epi8:
356; X86:       # %bb.0: # %entry
357; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
358; X86-NEXT:    vpcompressb %ymm0, %ymm0 {%k1} {z}
359; X86-NEXT:    retl
360;
361; X64-LABEL: test_mm256_maskz_compress_epi8:
362; X64:       # %bb.0: # %entry
363; X64-NEXT:    kmovd %edi, %k1
364; X64-NEXT:    vpcompressb %ymm0, %ymm0 {%k1} {z}
365; X64-NEXT:    retq
366entry:
367  %0 = bitcast <4 x i64> %__D to <32 x i8>
368  %1 = tail call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %0, <32 x i8> zeroinitializer, i32 %__U)
369  %2 = bitcast <32 x i8> %1 to <4 x i64>
370  ret <4 x i64> %2
371}
372
373define void @test_mm256_mask_compressstoreu_epi16(i8* %__P, i16 zeroext %__U, <4 x i64> %__D) {
374; X86-LABEL: test_mm256_mask_compressstoreu_epi16:
375; X86:       # %bb.0: # %entry
376; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
377; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
378; X86-NEXT:    vpcompressw %ymm0, (%eax) {%k1}
379; X86-NEXT:    vzeroupper
380; X86-NEXT:    retl
381;
382; X64-LABEL: test_mm256_mask_compressstoreu_epi16:
383; X64:       # %bb.0: # %entry
384; X64-NEXT:    kmovd %esi, %k1
385; X64-NEXT:    vpcompressw %ymm0, (%rdi) {%k1}
386; X64-NEXT:    vzeroupper
387; X64-NEXT:    retq
388entry:
389  %0 = bitcast <4 x i64> %__D to <16 x i16>
390  %1 = bitcast i8* %__P to i16*
391  %2 = bitcast i16 %__U to <16 x i1>
392  tail call void @llvm.masked.compressstore.v16i16(<16 x i16> %0, i16* %1, <16 x i1> %2)
393  ret void
394}
395
396define void @test_mm256_mask_compressstoreu_epi8(i8* %__P, i32 %__U, <4 x i64> %__D) {
397; X86-LABEL: test_mm256_mask_compressstoreu_epi8:
398; X86:       # %bb.0: # %entry
399; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
400; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
401; X86-NEXT:    vpcompressb %ymm0, (%eax) {%k1}
402; X86-NEXT:    vzeroupper
403; X86-NEXT:    retl
404;
405; X64-LABEL: test_mm256_mask_compressstoreu_epi8:
406; X64:       # %bb.0: # %entry
407; X64-NEXT:    kmovd %esi, %k1
408; X64-NEXT:    vpcompressb %ymm0, (%rdi) {%k1}
409; X64-NEXT:    vzeroupper
410; X64-NEXT:    retq
411entry:
412  %0 = bitcast <4 x i64> %__D to <32 x i8>
413  %1 = bitcast i32 %__U to <32 x i1>
414  tail call void @llvm.masked.compressstore.v32i8(<32 x i8> %0, i8* %__P, <32 x i1> %1)
415  ret void
416}
417
418define <4 x i64> @test_mm256_mask_expand_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__D) {
419; X86-LABEL: test_mm256_mask_expand_epi16:
420; X86:       # %bb.0: # %entry
421; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
422; X86-NEXT:    vpexpandw %ymm1, %ymm0 {%k1}
423; X86-NEXT:    retl
424;
425; X64-LABEL: test_mm256_mask_expand_epi16:
426; X64:       # %bb.0: # %entry
427; X64-NEXT:    kmovd %edi, %k1
428; X64-NEXT:    vpexpandw %ymm1, %ymm0 {%k1}
429; X64-NEXT:    retq
430entry:
431  %0 = bitcast <4 x i64> %__D to <16 x i16>
432  %1 = bitcast <4 x i64> %__S to <16 x i16>
433  %2 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %0, <16 x i16> %1, i16 %__U)
434  %3 = bitcast <16 x i16> %2 to <4 x i64>
435  ret <4 x i64> %3
436}
437
438define <4 x i64> @test_mm256_maskz_expand_epi16(i16 zeroext %__U, <4 x i64> %__D) {
439; X86-LABEL: test_mm256_maskz_expand_epi16:
440; X86:       # %bb.0: # %entry
441; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
442; X86-NEXT:    vpexpandw %ymm0, %ymm0 {%k1} {z}
443; X86-NEXT:    retl
444;
445; X64-LABEL: test_mm256_maskz_expand_epi16:
446; X64:       # %bb.0: # %entry
447; X64-NEXT:    kmovd %edi, %k1
448; X64-NEXT:    vpexpandw %ymm0, %ymm0 {%k1} {z}
449; X64-NEXT:    retq
450entry:
451  %0 = bitcast <4 x i64> %__D to <16 x i16>
452  %1 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %0, <16 x i16> zeroinitializer, i16 %__U)
453  %2 = bitcast <16 x i16> %1 to <4 x i64>
454  ret <4 x i64> %2
455}
456
457define <4 x i64> @test_mm256_mask_expand_epi8(<4 x i64> %__S, i32 %__U, <4 x i64> %__D) {
458; X86-LABEL: test_mm256_mask_expand_epi8:
459; X86:       # %bb.0: # %entry
460; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
461; X86-NEXT:    vpexpandb %ymm1, %ymm0 {%k1}
462; X86-NEXT:    retl
463;
464; X64-LABEL: test_mm256_mask_expand_epi8:
465; X64:       # %bb.0: # %entry
466; X64-NEXT:    kmovd %edi, %k1
467; X64-NEXT:    vpexpandb %ymm1, %ymm0 {%k1}
468; X64-NEXT:    retq
469entry:
470  %0 = bitcast <4 x i64> %__D to <32 x i8>
471  %1 = bitcast <4 x i64> %__S to <32 x i8>
472  %2 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %0, <32 x i8> %1, i32 %__U)
473  %3 = bitcast <32 x i8> %2 to <4 x i64>
474  ret <4 x i64> %3
475}
476
477define <4 x i64> @test_mm256_maskz_expand_epi8(i32 %__U, <4 x i64> %__D) {
478; X86-LABEL: test_mm256_maskz_expand_epi8:
479; X86:       # %bb.0: # %entry
480; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
481; X86-NEXT:    vpexpandb %ymm0, %ymm0 {%k1} {z}
482; X86-NEXT:    retl
483;
484; X64-LABEL: test_mm256_maskz_expand_epi8:
485; X64:       # %bb.0: # %entry
486; X64-NEXT:    kmovd %edi, %k1
487; X64-NEXT:    vpexpandb %ymm0, %ymm0 {%k1} {z}
488; X64-NEXT:    retq
489entry:
490  %0 = bitcast <4 x i64> %__D to <32 x i8>
491  %1 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %0, <32 x i8> zeroinitializer, i32 %__U)
492  %2 = bitcast <32 x i8> %1 to <4 x i64>
493  ret <4 x i64> %2
494}
495
496define <4 x i64> @test_mm256_mask_expandloadu_epi16(<4 x i64> %__S, i16 zeroext %__U, i8* readonly %__P) {
497; X86-LABEL: test_mm256_mask_expandloadu_epi16:
498; X86:       # %bb.0: # %entry
499; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
500; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
501; X86-NEXT:    vpexpandw (%eax), %ymm0 {%k1}
502; X86-NEXT:    retl
503;
504; X64-LABEL: test_mm256_mask_expandloadu_epi16:
505; X64:       # %bb.0: # %entry
506; X64-NEXT:    kmovd %edi, %k1
507; X64-NEXT:    vpexpandw (%rsi), %ymm0 {%k1}
508; X64-NEXT:    retq
509entry:
510  %0 = bitcast <4 x i64> %__S to <16 x i16>
511  %1 = bitcast i8* %__P to i16*
512  %2 = bitcast i16 %__U to <16 x i1>
513  %3 = tail call <16 x i16> @llvm.masked.expandload.v16i16(i16* %1, <16 x i1> %2, <16 x i16> %0)
514  %4 = bitcast <16 x i16> %3 to <4 x i64>
515  ret <4 x i64> %4
516}
517
518define <4 x i64> @test_mm256_maskz_expandloadu_epi16(i16 zeroext %__U, i8* readonly %__P) {
519; X86-LABEL: test_mm256_maskz_expandloadu_epi16:
520; X86:       # %bb.0: # %entry
521; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
522; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
523; X86-NEXT:    vpexpandw (%eax), %ymm0 {%k1} {z}
524; X86-NEXT:    retl
525;
526; X64-LABEL: test_mm256_maskz_expandloadu_epi16:
527; X64:       # %bb.0: # %entry
528; X64-NEXT:    kmovd %edi, %k1
529; X64-NEXT:    vpexpandw (%rsi), %ymm0 {%k1} {z}
530; X64-NEXT:    retq
531entry:
532  %0 = bitcast i8* %__P to i16*
533  %1 = bitcast i16 %__U to <16 x i1>
534  %2 = tail call <16 x i16> @llvm.masked.expandload.v16i16(i16* %0, <16 x i1> %1, <16 x i16> zeroinitializer)
535  %3 = bitcast <16 x i16> %2 to <4 x i64>
536  ret <4 x i64> %3
537}
538
539define <4 x i64> @test_mm256_mask_expandloadu_epi8(<4 x i64> %__S, i32 %__U, i8* readonly %__P) {
540; X86-LABEL: test_mm256_mask_expandloadu_epi8:
541; X86:       # %bb.0: # %entry
542; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
543; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
544; X86-NEXT:    vpexpandb (%eax), %ymm0 {%k1}
545; X86-NEXT:    retl
546;
547; X64-LABEL: test_mm256_mask_expandloadu_epi8:
548; X64:       # %bb.0: # %entry
549; X64-NEXT:    kmovd %edi, %k1
550; X64-NEXT:    vpexpandb (%rsi), %ymm0 {%k1}
551; X64-NEXT:    retq
552entry:
553  %0 = bitcast <4 x i64> %__S to <32 x i8>
554  %1 = bitcast i32 %__U to <32 x i1>
555  %2 = tail call <32 x i8> @llvm.masked.expandload.v32i8(i8* %__P, <32 x i1> %1, <32 x i8> %0)
556  %3 = bitcast <32 x i8> %2 to <4 x i64>
557  ret <4 x i64> %3
558}
559
560define <4 x i64> @test_mm256_maskz_expandloadu_epi8(i32 %__U, i8* readonly %__P) {
561; X86-LABEL: test_mm256_maskz_expandloadu_epi8:
562; X86:       # %bb.0: # %entry
563; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
564; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
565; X86-NEXT:    vpexpandb (%eax), %ymm0 {%k1} {z}
566; X86-NEXT:    retl
567;
568; X64-LABEL: test_mm256_maskz_expandloadu_epi8:
569; X64:       # %bb.0: # %entry
570; X64-NEXT:    kmovd %edi, %k1
571; X64-NEXT:    vpexpandb (%rsi), %ymm0 {%k1} {z}
572; X64-NEXT:    retq
573entry:
574  %0 = bitcast i32 %__U to <32 x i1>
575  %1 = tail call <32 x i8> @llvm.masked.expandload.v32i8(i8* %__P, <32 x i1> %0, <32 x i8> zeroinitializer)
576  %2 = bitcast <32 x i8> %1 to <4 x i64>
577  ret <4 x i64> %2
578}
579
580define <4 x i64> @test_mm256_mask_shldi_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
581; X86-LABEL: test_mm256_mask_shldi_epi64:
582; X86:       # %bb.0: # %entry
583; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
584; X86-NEXT:    kmovd %eax, %k1
585; X86-NEXT:    vpshldq $47, %ymm2, %ymm1, %ymm0 {%k1}
586; X86-NEXT:    retl
587;
588; X64-LABEL: test_mm256_mask_shldi_epi64:
589; X64:       # %bb.0: # %entry
590; X64-NEXT:    kmovd %edi, %k1
591; X64-NEXT:    vpshldq $47, %ymm2, %ymm1, %ymm0 {%k1}
592; X64-NEXT:    retq
593entry:
594  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> <i64 47, i64 47, i64 47, i64 47>)
595  %1 = bitcast i8 %__U to <8 x i1>
596  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
597  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S
598  ret <4 x i64> %2
599}
600
601declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
602
603define <4 x i64> @test_mm256_maskz_shldi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
604; X86-LABEL: test_mm256_maskz_shldi_epi64:
605; X86:       # %bb.0: # %entry
606; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
607; X86-NEXT:    kmovd %eax, %k1
608; X86-NEXT:    vpshldq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
609; X86-NEXT:    retl
610;
611; X64-LABEL: test_mm256_maskz_shldi_epi64:
612; X64:       # %bb.0: # %entry
613; X64-NEXT:    kmovd %edi, %k1
614; X64-NEXT:    vpshldq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
615; X64-NEXT:    retq
616entry:
617  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
618  %1 = bitcast i8 %__U to <8 x i1>
619  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
620  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
621  ret <4 x i64> %2
622}
623
624define <4 x i64> @test_mm256_shldi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
625; CHECK-LABEL: test_mm256_shldi_epi64:
626; CHECK:       # %bb.0: # %entry
627; CHECK-NEXT:    vpshldq $31, %ymm1, %ymm0, %ymm0
628; CHECK-NEXT:    ret{{[l|q]}}
629entry:
630  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> <i64 31, i64 31, i64 31, i64 31>)
631  ret <4 x i64> %0
632}
633
634define <2 x i64> @test_mm_mask_shldi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
635; X86-LABEL: test_mm_mask_shldi_epi64:
636; X86:       # %bb.0: # %entry
637; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
638; X86-NEXT:    kmovd %eax, %k1
639; X86-NEXT:    vpshldq $47, %xmm2, %xmm1, %xmm0 {%k1}
640; X86-NEXT:    retl
641;
642; X64-LABEL: test_mm_mask_shldi_epi64:
643; X64:       # %bb.0: # %entry
644; X64-NEXT:    kmovd %edi, %k1
645; X64-NEXT:    vpshldq $47, %xmm2, %xmm1, %xmm0 {%k1}
646; X64-NEXT:    retq
647entry:
648  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> <i64 47, i64 47>)
649  %1 = bitcast i8 %__U to <8 x i1>
650  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
651  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S
652  ret <2 x i64> %2
653}
654
655declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
656
657define <2 x i64> @test_mm_maskz_shldi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
658; X86-LABEL: test_mm_maskz_shldi_epi64:
659; X86:       # %bb.0: # %entry
660; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
661; X86-NEXT:    kmovd %eax, %k1
662; X86-NEXT:    vpshldq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
663; X86-NEXT:    retl
664;
665; X64-LABEL: test_mm_maskz_shldi_epi64:
666; X64:       # %bb.0: # %entry
667; X64-NEXT:    kmovd %edi, %k1
668; X64-NEXT:    vpshldq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
669; X64-NEXT:    retq
670entry:
671  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> <i64 63, i64 63>)
672  %1 = bitcast i8 %__U to <8 x i1>
673  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
674  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
675  ret <2 x i64> %2
676}
677
678define <2 x i64> @test_mm_shldi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
679; CHECK-LABEL: test_mm_shldi_epi64:
680; CHECK:       # %bb.0: # %entry
681; CHECK-NEXT:    vpshldq $31, %xmm1, %xmm0, %xmm0
682; CHECK-NEXT:    ret{{[l|q]}}
683entry:
684  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> <i64 31, i64 31>)
685  ret <2 x i64> %0
686}
687
688define <4 x i64> @test_mm256_mask_shldi_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
689; X86-LABEL: test_mm256_mask_shldi_epi32:
690; X86:       # %bb.0: # %entry
691; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
692; X86-NEXT:    kmovd %eax, %k1
693; X86-NEXT:    vpshldd $7, %ymm2, %ymm1, %ymm0 {%k1}
694; X86-NEXT:    retl
695;
696; X64-LABEL: test_mm256_mask_shldi_epi32:
697; X64:       # %bb.0: # %entry
698; X64-NEXT:    kmovd %edi, %k1
699; X64-NEXT:    vpshldd $7, %ymm2, %ymm1, %ymm0 {%k1}
700; X64-NEXT:    retq
701entry:
702  %0 = bitcast <4 x i64> %__A to <8 x i32>
703  %1 = bitcast <4 x i64> %__B to <8 x i32>
704  %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>)
705  %3 = bitcast <4 x i64> %__S to <8 x i32>
706  %4 = bitcast i8 %__U to <8 x i1>
707  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
708  %6 = bitcast <8 x i32> %5 to <4 x i64>
709  ret <4 x i64> %6
710}
711
712declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
713
714define <4 x i64> @test_mm256_maskz_shldi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
715; X86-LABEL: test_mm256_maskz_shldi_epi32:
716; X86:       # %bb.0: # %entry
717; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
718; X86-NEXT:    kmovd %eax, %k1
719; X86-NEXT:    vpshldd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
720; X86-NEXT:    retl
721;
722; X64-LABEL: test_mm256_maskz_shldi_epi32:
723; X64:       # %bb.0: # %entry
724; X64-NEXT:    kmovd %edi, %k1
725; X64-NEXT:    vpshldd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
726; X64-NEXT:    retq
727entry:
728  %0 = bitcast <4 x i64> %__A to <8 x i32>
729  %1 = bitcast <4 x i64> %__B to <8 x i32>
730  %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>)
731  %3 = bitcast i8 %__U to <8 x i1>
732  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
733  %5 = bitcast <8 x i32> %4 to <4 x i64>
734  ret <4 x i64> %5
735}
736
737define <4 x i64> @test_mm256_shldi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
738; CHECK-LABEL: test_mm256_shldi_epi32:
739; CHECK:       # %bb.0: # %entry
740; CHECK-NEXT:    vpshldd $31, %ymm1, %ymm0, %ymm0
741; CHECK-NEXT:    ret{{[l|q]}}
742entry:
743  %0 = bitcast <4 x i64> %__A to <8 x i32>
744  %1 = bitcast <4 x i64> %__B to <8 x i32>
745  %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>)
746  %3 = bitcast <8 x i32> %2 to <4 x i64>
747  ret <4 x i64> %3
748}
749
750define <2 x i64> @test_mm_mask_shldi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
751; X86-LABEL: test_mm_mask_shldi_epi32:
752; X86:       # %bb.0: # %entry
753; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
754; X86-NEXT:    kmovd %eax, %k1
755; X86-NEXT:    vpshldd $7, %xmm2, %xmm1, %xmm0 {%k1}
756; X86-NEXT:    retl
757;
758; X64-LABEL: test_mm_mask_shldi_epi32:
759; X64:       # %bb.0: # %entry
760; X64-NEXT:    kmovd %edi, %k1
761; X64-NEXT:    vpshldd $7, %xmm2, %xmm1, %xmm0 {%k1}
762; X64-NEXT:    retq
763entry:
764  %0 = bitcast <2 x i64> %__A to <4 x i32>
765  %1 = bitcast <2 x i64> %__B to <4 x i32>
766  %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 7, i32 7, i32 7, i32 7>)
767  %3 = bitcast <2 x i64> %__S to <4 x i32>
768  %4 = bitcast i8 %__U to <8 x i1>
769  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
770  %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3
771  %6 = bitcast <4 x i32> %5 to <2 x i64>
772  ret <2 x i64> %6
773}
774
775declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
776
777define <2 x i64> @test_mm_maskz_shldi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
778; X86-LABEL: test_mm_maskz_shldi_epi32:
779; X86:       # %bb.0: # %entry
780; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
781; X86-NEXT:    kmovd %eax, %k1
782; X86-NEXT:    vpshldd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
783; X86-NEXT:    retl
784;
785; X64-LABEL: test_mm_maskz_shldi_epi32:
786; X64:       # %bb.0: # %entry
787; X64-NEXT:    kmovd %edi, %k1
788; X64-NEXT:    vpshldd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
789; X64-NEXT:    retq
790entry:
791  %0 = bitcast <2 x i64> %__A to <4 x i32>
792  %1 = bitcast <2 x i64> %__B to <4 x i32>
793  %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 15, i32 15, i32 15, i32 15>)
794  %3 = bitcast i8 %__U to <8 x i1>
795  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
796  %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer
797  %5 = bitcast <4 x i32> %4 to <2 x i64>
798  ret <2 x i64> %5
799}
800
801define <2 x i64> @test_mm_shldi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
802; CHECK-LABEL: test_mm_shldi_epi32:
803; CHECK:       # %bb.0: # %entry
804; CHECK-NEXT:    vpshldd $31, %xmm1, %xmm0, %xmm0
805; CHECK-NEXT:    ret{{[l|q]}}
806entry:
807  %0 = bitcast <2 x i64> %__A to <4 x i32>
808  %1 = bitcast <2 x i64> %__B to <4 x i32>
809  %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
810  %3 = bitcast <4 x i32> %2 to <2 x i64>
811  ret <2 x i64> %3
812}
813
814define <4 x i64> @test_mm256_mask_shldi_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
815; X86-LABEL: test_mm256_mask_shldi_epi16:
816; X86:       # %bb.0: # %entry
817; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
818; X86-NEXT:    vpshldw $3, %ymm2, %ymm1, %ymm0 {%k1}
819; X86-NEXT:    retl
820;
821; X64-LABEL: test_mm256_mask_shldi_epi16:
822; X64:       # %bb.0: # %entry
823; X64-NEXT:    kmovd %edi, %k1
824; X64-NEXT:    vpshldw $3, %ymm2, %ymm1, %ymm0 {%k1}
825; X64-NEXT:    retq
826entry:
827  %0 = bitcast <4 x i64> %__A to <16 x i16>
828  %1 = bitcast <4 x i64> %__B to <16 x i16>
829  %2 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
830  %3 = bitcast <4 x i64> %__S to <16 x i16>
831  %4 = bitcast i16 %__U to <16 x i1>
832  %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3
833  %6 = bitcast <16 x i16> %5 to <4 x i64>
834  ret <4 x i64> %6
835}
836
837declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
838
839define <4 x i64> @test_mm256_maskz_shldi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
840; X86-LABEL: test_mm256_maskz_shldi_epi16:
841; X86:       # %bb.0: # %entry
842; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
843; X86-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
844; X86-NEXT:    retl
845;
846; X64-LABEL: test_mm256_maskz_shldi_epi16:
847; X64:       # %bb.0: # %entry
848; X64-NEXT:    kmovd %edi, %k1
849; X64-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
850; X64-NEXT:    retq
851entry:
852  %0 = bitcast <4 x i64> %__A to <16 x i16>
853  %1 = bitcast <4 x i64> %__B to <16 x i16>
854  %2 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
855  %3 = bitcast i16 %__U to <16 x i1>
856  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
857  %5 = bitcast <16 x i16> %4 to <4 x i64>
858  ret <4 x i64> %5
859}
860
861define <4 x i64> @test_mm256_shldi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
862; CHECK-LABEL: test_mm256_shldi_epi16:
863; CHECK:       # %bb.0: # %entry
864; CHECK-NEXT:    vpshldw $15, %ymm1, %ymm0, %ymm0
865; CHECK-NEXT:    ret{{[l|q]}}
866entry:
867  %0 = bitcast <4 x i64> %__A to <16 x i16>
868  %1 = bitcast <4 x i64> %__B to <16 x i16>
869  %2 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
870  %3 = bitcast <16 x i16> %2 to <4 x i64>
871  ret <4 x i64> %3
872}
873
874define <2 x i64> @test_mm_mask_shldi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
875; X86-LABEL: test_mm_mask_shldi_epi16:
876; X86:       # %bb.0: # %entry
877; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
878; X86-NEXT:    kmovd %eax, %k1
879; X86-NEXT:    vpshldw $3, %xmm2, %xmm1, %xmm0 {%k1}
880; X86-NEXT:    retl
881;
882; X64-LABEL: test_mm_mask_shldi_epi16:
883; X64:       # %bb.0: # %entry
884; X64-NEXT:    kmovd %edi, %k1
885; X64-NEXT:    vpshldw $3, %xmm2, %xmm1, %xmm0 {%k1}
886; X64-NEXT:    retq
887entry:
888  %0 = bitcast <2 x i64> %__A to <8 x i16>
889  %1 = bitcast <2 x i64> %__B to <8 x i16>
890  %2 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
891  %3 = bitcast <2 x i64> %__S to <8 x i16>
892  %4 = bitcast i8 %__U to <8 x i1>
893  %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3
894  %6 = bitcast <8 x i16> %5 to <2 x i64>
895  ret <2 x i64> %6
896}
897
898declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
899
900define <2 x i64> @test_mm_maskz_shldi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
901; X86-LABEL: test_mm_maskz_shldi_epi16:
902; X86:       # %bb.0: # %entry
903; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
904; X86-NEXT:    kmovd %eax, %k1
905; X86-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
906; X86-NEXT:    retl
907;
908; X64-LABEL: test_mm_maskz_shldi_epi16:
909; X64:       # %bb.0: # %entry
910; X64-NEXT:    kmovd %edi, %k1
911; X64-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
912; X64-NEXT:    retq
913entry:
914  %0 = bitcast <2 x i64> %__A to <8 x i16>
915  %1 = bitcast <2 x i64> %__B to <8 x i16>
916  %2 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
917  %3 = bitcast i8 %__U to <8 x i1>
918  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
919  %5 = bitcast <8 x i16> %4 to <2 x i64>
920  ret <2 x i64> %5
921}
922
923define <2 x i64> @test_mm_shldi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
924; CHECK-LABEL: test_mm_shldi_epi16:
925; CHECK:       # %bb.0: # %entry
926; CHECK-NEXT:    vpshldw $15, %xmm1, %xmm0, %xmm0
927; CHECK-NEXT:    ret{{[l|q]}}
928entry:
929  %0 = bitcast <2 x i64> %__A to <8 x i16>
930  %1 = bitcast <2 x i64> %__B to <8 x i16>
931  %2 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
932  %3 = bitcast <8 x i16> %2 to <2 x i64>
933  ret <2 x i64> %3
934}
935
936define <4 x i64> @test_mm256_mask_shrdi_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
937; X86-LABEL: test_mm256_mask_shrdi_epi64:
938; X86:       # %bb.0: # %entry
939; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
940; X86-NEXT:    kmovd %eax, %k1
941; X86-NEXT:    vpshrdq $47, %ymm2, %ymm1, %ymm0 {%k1}
942; X86-NEXT:    retl
943;
944; X64-LABEL: test_mm256_mask_shrdi_epi64:
945; X64:       # %bb.0: # %entry
946; X64-NEXT:    kmovd %edi, %k1
947; X64-NEXT:    vpshrdq $47, %ymm2, %ymm1, %ymm0 {%k1}
948; X64-NEXT:    retq
949entry:
950  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__B, <4 x i64> %__A, <4 x i64> <i64 47, i64 47, i64 47, i64 47>)
951  %1 = bitcast i8 %__U to <8 x i1>
952  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
953  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__S
954  ret <4 x i64> %2
955}
956
957declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
958
959define <4 x i64> @test_mm256_maskz_shrdi_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
960; X86-LABEL: test_mm256_maskz_shrdi_epi64:
961; X86:       # %bb.0: # %entry
962; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
963; X86-NEXT:    kmovd %eax, %k1
964; X86-NEXT:    vpshrdq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
965; X86-NEXT:    retl
966;
967; X64-LABEL: test_mm256_maskz_shrdi_epi64:
968; X64:       # %bb.0: # %entry
969; X64-NEXT:    kmovd %edi, %k1
970; X64-NEXT:    vpshrdq $63, %ymm1, %ymm0, %ymm0 {%k1} {z}
971; X64-NEXT:    retq
972entry:
973  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__B, <4 x i64> %__A, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
974  %1 = bitcast i8 %__U to <8 x i1>
975  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
976  %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer
977  ret <4 x i64> %2
978}
979
980define <4 x i64> @test_mm256_shrdi_epi64(<4 x i64> %__A, <4 x i64> %__B) {
981; CHECK-LABEL: test_mm256_shrdi_epi64:
982; CHECK:       # %bb.0: # %entry
983; CHECK-NEXT:    vpshrdq $31, %ymm1, %ymm0, %ymm0
984; CHECK-NEXT:    ret{{[l|q]}}
985entry:
986  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__B, <4 x i64> %__A, <4 x i64> <i64 31, i64 31, i64 31, i64 31>)
987  ret <4 x i64> %0
988}
989
990define <2 x i64> @test_mm_mask_shrdi_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
991; X86-LABEL: test_mm_mask_shrdi_epi64:
992; X86:       # %bb.0: # %entry
993; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
994; X86-NEXT:    kmovd %eax, %k1
995; X86-NEXT:    vpshrdq $47, %xmm2, %xmm1, %xmm0 {%k1}
996; X86-NEXT:    retl
997;
998; X64-LABEL: test_mm_mask_shrdi_epi64:
999; X64:       # %bb.0: # %entry
1000; X64-NEXT:    kmovd %edi, %k1
1001; X64-NEXT:    vpshrdq $47, %xmm2, %xmm1, %xmm0 {%k1}
1002; X64-NEXT:    retq
1003entry:
1004  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__B, <2 x i64> %__A, <2 x i64> <i64 47, i64 47>)
1005  %1 = bitcast i8 %__U to <8 x i1>
1006  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1007  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__S
1008  ret <2 x i64> %2
1009}
1010
1011declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
1012
1013define <2 x i64> @test_mm_maskz_shrdi_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1014; X86-LABEL: test_mm_maskz_shrdi_epi64:
1015; X86:       # %bb.0: # %entry
1016; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1017; X86-NEXT:    kmovd %eax, %k1
1018; X86-NEXT:    vpshrdq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
1019; X86-NEXT:    retl
1020;
1021; X64-LABEL: test_mm_maskz_shrdi_epi64:
1022; X64:       # %bb.0: # %entry
1023; X64-NEXT:    kmovd %edi, %k1
1024; X64-NEXT:    vpshrdq $63, %xmm1, %xmm0, %xmm0 {%k1} {z}
1025; X64-NEXT:    retq
1026entry:
1027  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__B, <2 x i64> %__A, <2 x i64> <i64 63, i64 63>)
1028  %1 = bitcast i8 %__U to <8 x i1>
1029  %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
1030  %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer
1031  ret <2 x i64> %2
1032}
1033
1034define <2 x i64> @test_mm_shrdi_epi64(<2 x i64> %__A, <2 x i64> %__B) {
1035; CHECK-LABEL: test_mm_shrdi_epi64:
1036; CHECK:       # %bb.0: # %entry
1037; CHECK-NEXT:    vpshrdq $31, %xmm1, %xmm0, %xmm0
1038; CHECK-NEXT:    ret{{[l|q]}}
1039entry:
1040  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__B, <2 x i64> %__A, <2 x i64> <i64 31, i64 31>)
1041  ret <2 x i64> %0
1042}
1043
1044define <4 x i64> @test_mm256_mask_shrdi_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1045; X86-LABEL: test_mm256_mask_shrdi_epi32:
1046; X86:       # %bb.0: # %entry
1047; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1048; X86-NEXT:    kmovd %eax, %k1
1049; X86-NEXT:    vpshrdd $7, %ymm2, %ymm1, %ymm0 {%k1}
1050; X86-NEXT:    retl
1051;
1052; X64-LABEL: test_mm256_mask_shrdi_epi32:
1053; X64:       # %bb.0: # %entry
1054; X64-NEXT:    kmovd %edi, %k1
1055; X64-NEXT:    vpshrdd $7, %ymm2, %ymm1, %ymm0 {%k1}
1056; X64-NEXT:    retq
1057entry:
1058  %0 = bitcast <4 x i64> %__A to <8 x i32>
1059  %1 = bitcast <4 x i64> %__B to <8 x i32>
1060  %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>)
1061  %3 = bitcast <4 x i64> %__S to <8 x i32>
1062  %4 = bitcast i8 %__U to <8 x i1>
1063  %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3
1064  %6 = bitcast <8 x i32> %5 to <4 x i64>
1065  ret <4 x i64> %6
1066}
1067
1068declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
1069
1070define <4 x i64> @test_mm256_maskz_shrdi_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1071; X86-LABEL: test_mm256_maskz_shrdi_epi32:
1072; X86:       # %bb.0: # %entry
1073; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1074; X86-NEXT:    kmovd %eax, %k1
1075; X86-NEXT:    vpshrdd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
1076; X86-NEXT:    retl
1077;
1078; X64-LABEL: test_mm256_maskz_shrdi_epi32:
1079; X64:       # %bb.0: # %entry
1080; X64-NEXT:    kmovd %edi, %k1
1081; X64-NEXT:    vpshrdd $15, %ymm1, %ymm0, %ymm0 {%k1} {z}
1082; X64-NEXT:    retq
1083entry:
1084  %0 = bitcast <4 x i64> %__A to <8 x i32>
1085  %1 = bitcast <4 x i64> %__B to <8 x i32>
1086  %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>)
1087  %3 = bitcast i8 %__U to <8 x i1>
1088  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
1089  %5 = bitcast <8 x i32> %4 to <4 x i64>
1090  ret <4 x i64> %5
1091}
1092
1093define <4 x i64> @test_mm256_shrdi_epi32(<4 x i64> %__A, <4 x i64> %__B) {
1094; CHECK-LABEL: test_mm256_shrdi_epi32:
1095; CHECK:       # %bb.0: # %entry
1096; CHECK-NEXT:    vpshrdd $31, %ymm1, %ymm0, %ymm0
1097; CHECK-NEXT:    ret{{[l|q]}}
1098entry:
1099  %0 = bitcast <4 x i64> %__A to <8 x i32>
1100  %1 = bitcast <4 x i64> %__B to <8 x i32>
1101  %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>)
1102  %3 = bitcast <8 x i32> %2 to <4 x i64>
1103  ret <4 x i64> %3
1104}
1105
1106define <2 x i64> @test_mm_mask_shrdi_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1107; X86-LABEL: test_mm_mask_shrdi_epi32:
1108; X86:       # %bb.0: # %entry
1109; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1110; X86-NEXT:    kmovd %eax, %k1
1111; X86-NEXT:    vpshrdd $7, %xmm2, %xmm1, %xmm0 {%k1}
1112; X86-NEXT:    retl
1113;
1114; X64-LABEL: test_mm_mask_shrdi_epi32:
1115; X64:       # %bb.0: # %entry
1116; X64-NEXT:    kmovd %edi, %k1
1117; X64-NEXT:    vpshrdd $7, %xmm2, %xmm1, %xmm0 {%k1}
1118; X64-NEXT:    retq
1119entry:
1120  %0 = bitcast <2 x i64> %__A to <4 x i32>
1121  %1 = bitcast <2 x i64> %__B to <4 x i32>
1122  %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 7, i32 7, i32 7, i32 7>)
1123  %3 = bitcast <2 x i64> %__S to <4 x i32>
1124  %4 = bitcast i8 %__U to <8 x i1>
1125  %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1126  %5 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> %3
1127  %6 = bitcast <4 x i32> %5 to <2 x i64>
1128  ret <2 x i64> %6
1129}
1130
1131declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
1132
1133define <2 x i64> @test_mm_maskz_shrdi_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1134; X86-LABEL: test_mm_maskz_shrdi_epi32:
1135; X86:       # %bb.0: # %entry
1136; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1137; X86-NEXT:    kmovd %eax, %k1
1138; X86-NEXT:    vpshrdd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
1139; X86-NEXT:    retl
1140;
1141; X64-LABEL: test_mm_maskz_shrdi_epi32:
1142; X64:       # %bb.0: # %entry
1143; X64-NEXT:    kmovd %edi, %k1
1144; X64-NEXT:    vpshrdd $15, %xmm1, %xmm0, %xmm0 {%k1} {z}
1145; X64-NEXT:    retq
1146entry:
1147  %0 = bitcast <2 x i64> %__A to <4 x i32>
1148  %1 = bitcast <2 x i64> %__B to <4 x i32>
1149  %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 15, i32 15, i32 15, i32 15>)
1150  %3 = bitcast i8 %__U to <8 x i1>
1151  %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1152  %4 = select <4 x i1> %extract, <4 x i32> %2, <4 x i32> zeroinitializer
1153  %5 = bitcast <4 x i32> %4 to <2 x i64>
1154  ret <2 x i64> %5
1155}
1156
1157define <2 x i64> @test_mm_shrdi_epi32(<2 x i64> %__A, <2 x i64> %__B) {
1158; CHECK-LABEL: test_mm_shrdi_epi32:
1159; CHECK:       # %bb.0: # %entry
1160; CHECK-NEXT:    vpshrdd $31, %xmm1, %xmm0, %xmm0
1161; CHECK-NEXT:    ret{{[l|q]}}
1162entry:
1163  %0 = bitcast <2 x i64> %__A to <4 x i32>
1164  %1 = bitcast <2 x i64> %__B to <4 x i32>
1165  %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
1166  %3 = bitcast <4 x i32> %2 to <2 x i64>
1167  ret <2 x i64> %3
1168}
1169
1170define <4 x i64> @test_mm256_mask_shrdi_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1171; X86-LABEL: test_mm256_mask_shrdi_epi16:
1172; X86:       # %bb.0: # %entry
1173; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
1174; X86-NEXT:    vpshrdw $3, %ymm2, %ymm1, %ymm0 {%k1}
1175; X86-NEXT:    retl
1176;
1177; X64-LABEL: test_mm256_mask_shrdi_epi16:
1178; X64:       # %bb.0: # %entry
1179; X64-NEXT:    kmovd %edi, %k1
1180; X64-NEXT:    vpshrdw $3, %ymm2, %ymm1, %ymm0 {%k1}
1181; X64-NEXT:    retq
1182entry:
1183  %0 = bitcast <4 x i64> %__A to <16 x i16>
1184  %1 = bitcast <4 x i64> %__B to <16 x i16>
1185  %2 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
1186  %3 = bitcast <4 x i64> %__S to <16 x i16>
1187  %4 = bitcast i16 %__U to <16 x i1>
1188  %5 = select <16 x i1> %4, <16 x i16> %2, <16 x i16> %3
1189  %6 = bitcast <16 x i16> %5 to <4 x i64>
1190  ret <4 x i64> %6
1191}
1192
1193declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
1194
1195define <4 x i64> @test_mm256_maskz_shrdi_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
1196; X86-LABEL: test_mm256_maskz_shrdi_epi16:
1197; X86:       # %bb.0: # %entry
1198; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
1199; X86-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
1200; X86-NEXT:    retl
1201;
1202; X64-LABEL: test_mm256_maskz_shrdi_epi16:
1203; X64:       # %bb.0: # %entry
1204; X64-NEXT:    kmovd %edi, %k1
1205; X64-NEXT:    vpshrdw $7, %ymm1, %ymm0, %ymm0 {%k1} {z}
1206; X64-NEXT:    retq
1207entry:
1208  %0 = bitcast <4 x i64> %__A to <16 x i16>
1209  %1 = bitcast <4 x i64> %__B to <16 x i16>
1210  %2 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1211  %3 = bitcast i16 %__U to <16 x i1>
1212  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
1213  %5 = bitcast <16 x i16> %4 to <4 x i64>
1214  ret <4 x i64> %5
1215}
1216
1217define <4 x i64> @test_mm256_shrdi_epi16(<4 x i64> %__A, <4 x i64> %__B) {
1218; CHECK-LABEL: test_mm256_shrdi_epi16:
1219; CHECK:       # %bb.0: # %entry
1220; CHECK-NEXT:    vpshrdw $15, %ymm1, %ymm0, %ymm0
1221; CHECK-NEXT:    ret{{[l|q]}}
1222entry:
1223  %0 = bitcast <4 x i64> %__A to <16 x i16>
1224  %1 = bitcast <4 x i64> %__B to <16 x i16>
1225  %2 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
1226  %3 = bitcast <16 x i16> %2 to <4 x i64>
1227  ret <4 x i64> %3
1228}
1229
1230define <2 x i64> @test_mm_mask_shrdi_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1231; X86-LABEL: test_mm_mask_shrdi_epi16:
1232; X86:       # %bb.0: # %entry
1233; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1234; X86-NEXT:    kmovd %eax, %k1
1235; X86-NEXT:    vpshrdw $3, %xmm2, %xmm1, %xmm0 {%k1}
1236; X86-NEXT:    retl
1237;
1238; X64-LABEL: test_mm_mask_shrdi_epi16:
1239; X64:       # %bb.0: # %entry
1240; X64-NEXT:    kmovd %edi, %k1
1241; X64-NEXT:    vpshrdw $3, %xmm2, %xmm1, %xmm0 {%k1}
1242; X64-NEXT:    retq
1243entry:
1244  %0 = bitcast <2 x i64> %__A to <8 x i16>
1245  %1 = bitcast <2 x i64> %__B to <8 x i16>
1246  %2 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
1247  %3 = bitcast <2 x i64> %__S to <8 x i16>
1248  %4 = bitcast i8 %__U to <8 x i1>
1249  %5 = select <8 x i1> %4, <8 x i16> %2, <8 x i16> %3
1250  %6 = bitcast <8 x i16> %5 to <2 x i64>
1251  ret <2 x i64> %6
1252}
1253
1254declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
1255
1256define <2 x i64> @test_mm_maskz_shrdi_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
1257; X86-LABEL: test_mm_maskz_shrdi_epi16:
1258; X86:       # %bb.0: # %entry
1259; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
1260; X86-NEXT:    kmovd %eax, %k1
1261; X86-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
1262; X86-NEXT:    retl
1263;
1264; X64-LABEL: test_mm_maskz_shrdi_epi16:
1265; X64:       # %bb.0: # %entry
1266; X64-NEXT:    kmovd %edi, %k1
1267; X64-NEXT:    vpshrdw $7, %xmm1, %xmm0, %xmm0 {%k1} {z}
1268; X64-NEXT:    retq
1269entry:
1270  %0 = bitcast <2 x i64> %__A to <8 x i16>
1271  %1 = bitcast <2 x i64> %__B to <8 x i16>
1272  %2 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1273  %3 = bitcast i8 %__U to <8 x i1>
1274  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
1275  %5 = bitcast <8 x i16> %4 to <2 x i64>
1276  ret <2 x i64> %5
1277}
1278
1279define <2 x i64> @test_mm_shrdi_epi16(<2 x i64> %__A, <2 x i64> %__B) {
1280; CHECK-LABEL: test_mm_shrdi_epi16:
1281; CHECK:       # %bb.0: # %entry
1282; CHECK-NEXT:    vpshrdw $15, %xmm1, %xmm0, %xmm0
1283; CHECK-NEXT:    ret{{[l|q]}}
1284entry:
1285  %0 = bitcast <2 x i64> %__A to <8 x i16>
1286  %1 = bitcast <2 x i64> %__B to <8 x i16>
1287  %2 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> <i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31>)
1288  %3 = bitcast <8 x i16> %2 to <2 x i64>
1289  ret <2 x i64> %3
1290}
1291
1292define <4 x i64> @test_mm256_mask_shldv_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shldv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shldv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__S
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_shldv_epi64(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shldv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shldv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_shldv_epi64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shldv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvq %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B)
  ret <4 x i64> %0
}

define <2 x i64> @test_mm_mask_shldv_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shldv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shldv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__S
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_shldv_epi64(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shldv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shldv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_shldv_epi64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shldv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B)
  ret <2 x i64> %0
}

define <4 x i64> @test_mm256_mask_shldv_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shldv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shldv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_shldv_epi32(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shldv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shldv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_shldv_epi32(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shldv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_mask_shldv_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shldv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shldv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_shldv_epi32(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shldv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shldv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_shldv_epi32(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shldv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <4 x i64> @test_mm256_mask_shldv_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shldv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shldv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %0
  %6 = bitcast <16 x i16> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_shldv_epi16(i16 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shldv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shldv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
  %6 = bitcast <16 x i16> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_shldv_epi16(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shldv_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvw %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %0, <16 x i16> %1, <16 x i16> %2)
  %4 = bitcast <16 x i16> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_mask_shldv_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shldv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shldv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %0
  %6 = bitcast <8 x i16> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_shldv_epi16(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shldv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shldv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer
  %6 = bitcast <8 x i16> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_shldv_epi16(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shldv_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <4 x i64> @test_mm256_mask_shrdv_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shrdv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shrdv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__S, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__S
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_shrdv_epi64(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shrdv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shrdv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__S, <4 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_shrdv_epi64(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shrdv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvq %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__S, <4 x i64> %__B)
  ret <4 x i64> %0
}

define <2 x i64> @test_mm_mask_shrdv_epi64(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shrdv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shrdv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__S, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__S
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_shrdv_epi64(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shrdv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shrdv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__S, <2 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_shrdv_epi64(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shrdv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvq %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__S, <2 x i64> %__B)
  ret <2 x i64> %0
}

define <4 x i64> @test_mm256_mask_shrdv_epi32(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shrdv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shrdv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_shrdv_epi32(i8 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shrdv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shrdv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer
  %6 = bitcast <8 x i32> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_shrdv_epi32(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shrdv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvd %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__S to <8 x i32>
  %1 = bitcast <4 x i64> %__A to <8 x i32>
  %2 = bitcast <4 x i64> %__B to <8 x i32>
  %3 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %1, <8 x i32> %0, <8 x i32> %2)
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_mask_shrdv_epi32(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shrdv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shrdv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_shrdv_epi32(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shrdv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shrdv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer
  %6 = bitcast <4 x i32> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_shrdv_epi32(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shrdv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvd %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__S to <4 x i32>
  %1 = bitcast <2 x i64> %__A to <4 x i32>
  %2 = bitcast <2 x i64> %__B to <4 x i32>
  %3 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %1, <4 x i32> %0, <4 x i32> %2)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <4 x i64> @test_mm256_mask_shrdv_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shrdv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_shrdv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> %0
  %6 = bitcast <16 x i16> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_maskz_shrdv_epi16(i16 zeroext %__U, <4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shrdv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskz_shrdv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> zeroinitializer
  %6 = bitcast <16 x i16> %5 to <4 x i64>
  ret <4 x i64> %6
}

define <4 x i64> @test_mm256_shrdv_epi16(<4 x i64> %__S, <4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shrdv_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvw %ymm2, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__S to <16 x i16>
  %1 = bitcast <4 x i64> %__A to <16 x i16>
  %2 = bitcast <4 x i64> %__B to <16 x i16>
  %3 = tail call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %1, <16 x i16> %0, <16 x i16> %2)
  %4 = bitcast <16 x i16> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <2 x i64> @test_mm_mask_shrdv_epi16(<2 x i64> %__S, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_shrdv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_shrdv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> %0
  %6 = bitcast <8 x i16> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_maskz_shrdv_epi16(i8 zeroext %__U, <2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_maskz_shrdv_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovd %eax, %k1
; X86-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_shrdv_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> %2)
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> zeroinitializer
  %6 = bitcast <8 x i16> %5 to <2 x i64>
  ret <2 x i64> %6
}

define <2 x i64> @test_mm_shrdv_epi16(<2 x i64> %__S, <2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_shrdv_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpshrdvw %xmm2, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__S to <8 x i16>
  %1 = bitcast <2 x i64> %__A to <8 x i16>
  %2 = bitcast <2 x i64> %__B to <8 x i16>
  %3 = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %1, <8 x i16> %0, <8 x i16> %2)
  %4 = bitcast <8 x i16> %3 to <2 x i64>
  ret <2 x i64> %4
}

declare <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16>, <8 x i16>, i8)
declare <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8>, <16 x i8>, i16)
declare void @llvm.masked.compressstore.v8i16(<8 x i16>, i16*, <8 x i1>)
declare void @llvm.masked.compressstore.v16i8(<16 x i8>, i8*, <16 x i1>)
declare <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16>, <8 x i16>, i8)
declare <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8>, <16 x i8>, i16)
declare <8 x i16> @llvm.masked.expandload.v8i16(i16*, <8 x i1>, <8 x i16>)
declare <16 x i8> @llvm.masked.expandload.v16i8(i8*, <16 x i1>, <16 x i8>)
declare <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16>, <16 x i16>, i16)
declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8>, <32 x i8>, i32)
declare void @llvm.masked.compressstore.v16i16(<16 x i16>, i16*, <16 x i1>)
declare void @llvm.masked.compressstore.v32i8(<32 x i8>, i8*, <32 x i1>)
declare <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16>, <16 x i16>, i16)
declare <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8>, <32 x i8>, i32)
declare <16 x i16> @llvm.masked.expandload.v16i16(i16*, <16 x i1>, <16 x i16>)
declare <32 x i8> @llvm.masked.expandload.v32i8(i8*, <32 x i1>, <32 x i8>)
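
; Funnel-shift intrinsics referenced by the shldv/shrdv tests above; declared
; here on the assumption that they are not declared elsewhere in this file.
declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <16 x i16> @llvm.fshr.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)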