; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST
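; The X32 prefixes cover the 32-bit (i686) runs and X64 the 64-bit runs; the
; -SLOW/-FAST suffixes only diverge where +fast-variable-shuffle changes the
; lowering (see srl_trunc_and_v4i64 below).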

; AVX2 Logical Shift Left
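; Shifts by a uniform constant: the checks below expect a shift by 0 to fold
; away completely, a shift by 1 to become a vpadd, and anything else to lower
; to a single immediate vpsllw/vpslld/vpsllq.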

define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsllw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsllw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; X32-LABEL: test_slld_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; X32-LABEL: test_slld_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

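; A splatted variable shift amount should stay scalar: the checks expect the
; count to land in an xmm register (vmovd from memory on X32, from %edi on
; X64) and feed vpslld directly.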
define <8 x i32> @test_vpslld_var(i32 %shift) {
; X32-LABEL: test_vpslld_var:
; X32:       # %bb.0:
; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X32-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpslld_var:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; X32-LABEL: test_slld_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpslld $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpslld $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsllq $63, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsllq $63, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift
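; Arithmetic right shifts by a uniform constant: a zero amount should fold
; away and the rest should lower to a single vpsraw/vpsrad immediate (AVX2 has
; no packed arithmetic shift for i64 elements, hence no qword cases here).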

define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsraw $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsraw $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsraw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; X32-LABEL: test_srad_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; X32-LABEL: test_srad_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrad $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrad $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; X32-LABEL: test_srad_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrad $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

; AVX2 Logical Shift Right
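; As above, logical right shifts by a uniform constant should fold away for a
; zero amount and otherwise lower to a single vpsrlw/vpsrld/vpsrlq immediate.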

define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrlw $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrlw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrlw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; X32-LABEL: test_srld_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; X32-LABEL: test_srld_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrld $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrld $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; X32-LABEL: test_srld_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrld $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrld $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrlq $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrlq $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_3:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpsrlq $63, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_3:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpsrlq $63, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

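; The i64 shift amounts are masked and truncated to i32, so the shift itself
; should become a 128-bit vpsrlvd. The interesting part is the <4 x i64> to
; <4 x i32> truncation: vpshufd+vpermq on the -SLOW runs versus a single
; cross-lane vpermd on the -FAST (+fast-variable-shuffle) runs.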
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-SLOW-LABEL: srl_trunc_and_v4i64:
; X32-SLOW:       # %bb.0:
; X32-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X32-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X32-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-SLOW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X32-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X32-SLOW-NEXT:    vzeroupper
; X32-SLOW-NEXT:    retl
;
; X32-FAST-LABEL: srl_trunc_and_v4i64:
; X32-FAST:       # %bb.0:
; X32-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X32-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X32-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X32-FAST-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X32-FAST-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X32-FAST-NEXT:    vzeroupper
; X32-FAST-NEXT:    retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW:       # %bb.0:
; X64-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X64-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X64-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-SLOW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT:    vzeroupper
; X64-SLOW-NEXT:    retq
;
; X64-FAST-LABEL: srl_trunc_and_v4i64:
; X64-FAST:       # %bb.0:
; X64-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; X64-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; X64-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
; X64-FAST-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-FAST-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-FAST-NEXT:    vzeroupper
; X64-FAST-NEXT:    retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %sra
}

;
; Vectorized byte shifts
;
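; Fully variable per-element shifts: AVX2 has no per-element shift for i16 or
; i8 lanes, so the checks below expect the i16 cases to be widened to i32
; lanes and handled with vpsllvd/vpsravd/vpsrlvd, and the i8 cases to fall
; back to a vpsllw $5 + vpblendvb select ladder.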

define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: shl_8i16:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: shl_8i16:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: shl_16i16:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: shl_16i16:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}

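; For the 32 x i8 cases the expected pattern is a blend ladder: the amount
; vector is pre-shifted with vpsllw $5 so its bits can drive vpblendvb, which
; selects between the current value and shift-by-4/2/1 steps (vpand keeps the
; byte lanes clean and vpaddb advances the control mask between steps); the
; ashr variant below does the same trick on bytes unpacked to words.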
define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: shl_32i8:
; X32:       # %bb.0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpsllw $4, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsllw $2, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: shl_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsllw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsllw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}

define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: ashr_8i16:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X32-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: ashr_8i16:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: ashr_16i16:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: ashr_16i16:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}

define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: ashr_32i8:
; X32:       # %bb.0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT:    vpsraw $4, %ymm3, %ymm4
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT:    vpsraw $2, %ymm3, %ymm4
; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT:    vpsraw $1, %ymm3, %ymm4
; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X32-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT:    vpsraw $4, %ymm0, %ymm3
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsraw $2, %ymm0, %ymm3
; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsraw $1, %ymm0, %ymm3
; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X32-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: ashr_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT:    vpsraw $4, %ymm3, %ymm4
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT:    vpsraw $2, %ymm3, %ymm4
; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT:    vpsraw $1, %ymm3, %ymm4
; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X64-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT:    vpsraw $4, %ymm0, %ymm3
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsraw $2, %ymm0, %ymm3
; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsraw $1, %ymm0, %ymm3
; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X64-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}

define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: lshr_8i16:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: lshr_8i16:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: lshr_16i16:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: lshr_16i16:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: lshr_32i8:
; X32:       # %bb.0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X32-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: lshr_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}
