; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;
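; Each lane of %a is shifted right (logically) by the corresponding lane of
; %b. Only AVX2 has native per-element shifts (vpsrlvq/vpsrlvd); XOP emulates
; them with vpshl* on negated counts, and plain SSE builds the result from
; scalar-count shifts that are then blended together.
;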

define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrlq %xmm3, %xmm2
; SSE2-NEXT:    psrlq %xmm1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    movapd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrlq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT:    psrlq %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v2i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v2i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v2i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrlq %xmm3, %xmm2
; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; X32-SSE-NEXT:    movapd %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <2 x i64> %a, %b
  ret <2 x i64> %shift
}

define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlq $32, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm2, %xmm4
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrld %xmm4, %xmm5
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlq $32, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrld %xmm2, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld %xmm1, %xmm2
; SSE41-NEXT:    psrld %xmm3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
; X32-SSE-NEXT:    psrld %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psrlq $32, %xmm2
; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
; X32-SSE-NEXT:    psrld %xmm2, %xmm4
; X32-SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
; X32-SSE-NEXT:    psrld %xmm4, %xmm5
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-SSE-NEXT:    psrld %xmm1, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT:    retl
  %shift = lshr <4 x i32> %a, %b
  ret <4 x i32> %shift
}

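; None of these targets have a per-element i16 shift instruction, so v8i16 is
; lowered bit-serially: each bit of the count selects between the current
; value and that value shifted by 8, 4, 2 or 1 (psllw/psraw $15 sign masks on
; SSE2, a pblendvb ladder on SSE4.1/AVX1). AVX2 instead zero-extends to
; <8 x i32>, shifts with vpsrlvd, and narrows the result back.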
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    psllw $12, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psllw $12, %xmm0
; SSE41-NEXT:    psllw $4, %xmm1
; SSE41-NEXT:    por %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $4, %xmm1
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $2, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $12, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psrlw $8, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    psraw $15, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <8 x i16> %a, %b
  ret <8 x i16> %shift
}

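; x86 has no vector i8 shifts at all, so v16i8 uses the same bit-serial select
; on byte lanes: psllw $5 moves the 3-bit count to each byte's sign bit,
; pcmpgtb (or pblendvb) turns that bit into a lane mask, and each candidate
; value is a word shift plus a pand to clear bits that crossed byte
; boundaries. XOP has a genuine per-byte shift (vpshlb).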
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    psllw $5, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlw $4, %xmm3
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlw $2, %xmm3
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE41-NEXT:    paddb %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlw $1, %xmm3
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
; SSE41-NEXT:    paddb %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: var_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $5, %xmm1
; X32-SSE-NEXT:    pxor %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand .LCPI3_0, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand .LCPI3_1, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm1, %xmm1
; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
; X32-SSE-NEXT:    pandn %xmm0, %xmm1
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand .LCPI3_2, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <16 x i8> %a, %b
  ret <16 x i8> %shift
}

;
; Uniform Variable Shifts
;
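; When the count vector is a splat, every lane shifts by the same amount, so
; the scalar-count forms (psrlq/psrld/psrlw, which take the count from the low
; element) apply once any high bits of the count register are zeroed. v16i8
; still needs the blend ladder, since there is no scalar-count byte shift.
;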

define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    psrlq %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %shift = lshr <2 x i64> %a, %splat
  ret <2 x i64> %shift
}

define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    psrld %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT:    psrld %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOP-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    xorps %xmm2, %xmm2
; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT:    psrld %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i32> %a, %splat
  ret <4 x i32> %shift
}

define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    psrlw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT:    psrlw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; XOP-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movd %xmm1, %eax
; X32-SSE-NEXT:    movzwl %ax, %eax
; X32-SSE-NEXT:    movd %eax, %xmm1
; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i16> %a, %splat
  ret <8 x i16> %shift
}

define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; SSE2-NEXT:    psllw $5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    paddb %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm4
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $2, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrlw $1, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
; X32-SSE-NEXT:    psllw $5, %xmm2
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand .LCPI7_0, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand .LCPI7_1, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand .LCPI7_2, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i8> %a, %splat
  ret <16 x i8> %shift
}

;
; Constant Shifts
;
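; Constant per-lane counts allow immediate-count shifts: one shift per
; distinct count, blended together on SSE/AVX1, a single vpsrlv* with a
; constant-pool operand on AVX2, or vpshl* with a constant count vector on
; XOP.
;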

define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $7, %xmm1
; SSE2-NEXT:    psrlq $1, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlq $7, %xmm1
; SSE41-NEXT:    psrlq $1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v2i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v2i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v2i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrlq $7, %xmm1
; X32-SSE-NEXT:    psrlq $1, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT:    movapd %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <2 x i64> %a, <i64 1, i64 7>
  ret <2 x i64> %shift
}

define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $7, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $6, %xmm2
; SSE2-NEXT:    psrld $4, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $7, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $5, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $6, %xmm1
; SSE41-NEXT:    psrld $4, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrld $7, %xmm1
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrld $5, %xmm2
; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrld $6, %xmm2
; X32-SSE-NEXT:    psrld $4, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT:    retl
  %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %shift
}

define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $4, %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $2, %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $1, %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrlw $4, %xmm1
; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; X32-SSE-NEXT:    psrlw $2, %xmm1
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
; X32-SSE-NEXT:    pand %xmm0, %xmm1
; X32-SSE-NEXT:    psrlw $1, %xmm2
; X32-SSE-NEXT:    pandn %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}

define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE2-NEXT:    psllw $5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE41-NEXT:    psllw $5, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $4, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $2, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlw $1, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; X32-SSE-NEXT:    psllw $5, %xmm2
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand .LCPI11_1, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psrlw $2, %xmm0
; X32-SSE-NEXT:    pand .LCPI11_2, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psrlw $1, %xmm0
; X32-SSE-NEXT:    pand .LCPI11_3, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}

;
; Uniform Constant Shifts
;
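; A splat constant count is the cheapest case: a single immediate shift.
; v16i8 is emulated with a word shift plus a mask (psrlw + pand) to clear the
; bits that crossed byte boundaries.
;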

define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    psrlq $7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vpsrlq $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psrlq $7, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}

define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    psrld $5, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vpsrld $5, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psrld $5, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psrlw $3, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psrlw $3, %xmm0
; X32-SSE-NEXT:    pand .LCPI15_0, %xmm0
; X32-SSE-NEXT:    retl
  %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}