• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
8;
9; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
10; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
11
12;
13; Variable Shifts
14;
15
; shl <2 x i64> with per-lane variable counts. Pre-AVX2 SSE/AVX1 have no
; variable 64-bit vector shift, so each lane is shifted by its own count
; (psllq uses only the low qword of the count register) and the results are
; blended; AVX2/XOPAVX2 use vpsllvq and XOPAVX1 uses vpshlq directly.
16define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
17; SSE2-LABEL: var_shift_v2i64:
18; SSE2:       # BB#0:
19; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
20; SSE2-NEXT:    movdqa %xmm0, %xmm2
21; SSE2-NEXT:    psllq %xmm3, %xmm2
22; SSE2-NEXT:    psllq %xmm1, %xmm0
23; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
24; SSE2-NEXT:    movapd %xmm2, %xmm0
25; SSE2-NEXT:    retq
26;
27; SSE41-LABEL: var_shift_v2i64:
28; SSE41:       # BB#0:
29; SSE41-NEXT:    movdqa %xmm0, %xmm2
30; SSE41-NEXT:    psllq %xmm1, %xmm2
31; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
32; SSE41-NEXT:    psllq %xmm1, %xmm0
33; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
34; SSE41-NEXT:    retq
35;
36; AVX1-LABEL: var_shift_v2i64:
37; AVX1:       # BB#0:
38; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm2
39; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
40; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
41; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
42; AVX1-NEXT:    retq
43;
44; AVX2-LABEL: var_shift_v2i64:
45; AVX2:       # BB#0:
46; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
47; AVX2-NEXT:    retq
48;
49; XOPAVX1-LABEL: var_shift_v2i64:
50; XOPAVX1:       # BB#0:
51; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
52; XOPAVX1-NEXT:    retq
53;
54; XOPAVX2-LABEL: var_shift_v2i64:
55; XOPAVX2:       # BB#0:
56; XOPAVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
57; XOPAVX2-NEXT:    retq
58;
59; X32-SSE-LABEL: var_shift_v2i64:
60; X32-SSE:       # BB#0:
61; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
62; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
63; X32-SSE-NEXT:    psllq %xmm3, %xmm2
64; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
65; X32-SSE-NEXT:    psllq %xmm1, %xmm0
66; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
67; X32-SSE-NEXT:    movapd %xmm2, %xmm0
68; X32-SSE-NEXT:    retl
69  %shift = shl <2 x i64> %a, %b
70  ret <2 x i64> %shift
71}
72
; shl <4 x i32> with per-lane variable counts. SSE targets build 2^b per lane
; via the float-exponent trick (pslld $23 to move the count into the exponent
; field, paddd a bias constant, cvttps2dq) and then multiply; SSE2 needs the
; pmuludq/pshufd dance because it lacks a 32-bit vector multiply (pmulld).
; AVX2 uses vpsllvd and XOPAVX1 uses vpshld directly.
73define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
74; SSE2-LABEL: var_shift_v4i32:
75; SSE2:       # BB#0:
76; SSE2-NEXT:    pslld $23, %xmm1
77; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
78; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
79; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
80; SSE2-NEXT:    pmuludq %xmm0, %xmm1
81; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
82; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
83; SSE2-NEXT:    pmuludq %xmm2, %xmm0
84; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
85; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
86; SSE2-NEXT:    movdqa %xmm1, %xmm0
87; SSE2-NEXT:    retq
88;
89; SSE41-LABEL: var_shift_v4i32:
90; SSE41:       # BB#0:
91; SSE41-NEXT:    pslld $23, %xmm1
92; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
93; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
94; SSE41-NEXT:    pmulld %xmm1, %xmm0
95; SSE41-NEXT:    retq
96;
97; AVX1-LABEL: var_shift_v4i32:
98; AVX1:       # BB#0:
99; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
100; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
101; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
102; AVX1-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
103; AVX1-NEXT:    retq
104;
105; AVX2-LABEL: var_shift_v4i32:
106; AVX2:       # BB#0:
107; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
108; AVX2-NEXT:    retq
109;
110; XOPAVX1-LABEL: var_shift_v4i32:
111; XOPAVX1:       # BB#0:
112; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
113; XOPAVX1-NEXT:    retq
114;
115; XOPAVX2-LABEL: var_shift_v4i32:
116; XOPAVX2:       # BB#0:
117; XOPAVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
118; XOPAVX2-NEXT:    retq
119;
120; X32-SSE-LABEL: var_shift_v4i32:
121; X32-SSE:       # BB#0:
122; X32-SSE-NEXT:    pslld $23, %xmm1
123; X32-SSE-NEXT:    paddd .LCPI1_0, %xmm1
124; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
125; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
126; X32-SSE-NEXT:    pmuludq %xmm0, %xmm1
127; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
128; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
129; X32-SSE-NEXT:    pmuludq %xmm2, %xmm0
130; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
131; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
132; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
133; X32-SSE-NEXT:    retl
134  %shift = shl <4 x i32> %a, %b
135  ret <4 x i32> %shift
136}
137
; shl <8 x i16> with per-lane variable counts. SSE2 processes the count one
; bit at a time: psllw $12 moves a count bit to the sign position, psraw $15
; turns it into an all-ones/all-zeros mask, and pand/pandn/por select between
; the shifted (by 8/4/2/1) and unshifted value. SSE41/AVX1 do the same
; selection with pblendvb. AVX2 widens to <8 x i32>, uses vpsllvd, and packs
; back down (hence vzeroupper before returning). XOP uses vpshlw directly.
138define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
139; SSE2-LABEL: var_shift_v8i16:
140; SSE2:       # BB#0:
141; SSE2-NEXT:    psllw $12, %xmm1
142; SSE2-NEXT:    movdqa %xmm1, %xmm2
143; SSE2-NEXT:    psraw $15, %xmm2
144; SSE2-NEXT:    movdqa %xmm2, %xmm3
145; SSE2-NEXT:    pandn %xmm0, %xmm3
146; SSE2-NEXT:    psllw $8, %xmm0
147; SSE2-NEXT:    pand %xmm2, %xmm0
148; SSE2-NEXT:    por %xmm3, %xmm0
149; SSE2-NEXT:    paddw %xmm1, %xmm1
150; SSE2-NEXT:    movdqa %xmm1, %xmm2
151; SSE2-NEXT:    psraw $15, %xmm2
152; SSE2-NEXT:    movdqa %xmm2, %xmm3
153; SSE2-NEXT:    pandn %xmm0, %xmm3
154; SSE2-NEXT:    psllw $4, %xmm0
155; SSE2-NEXT:    pand %xmm2, %xmm0
156; SSE2-NEXT:    por %xmm3, %xmm0
157; SSE2-NEXT:    paddw %xmm1, %xmm1
158; SSE2-NEXT:    movdqa %xmm1, %xmm2
159; SSE2-NEXT:    psraw $15, %xmm2
160; SSE2-NEXT:    movdqa %xmm2, %xmm3
161; SSE2-NEXT:    pandn %xmm0, %xmm3
162; SSE2-NEXT:    psllw $2, %xmm0
163; SSE2-NEXT:    pand %xmm2, %xmm0
164; SSE2-NEXT:    por %xmm3, %xmm0
165; SSE2-NEXT:    paddw %xmm1, %xmm1
166; SSE2-NEXT:    psraw $15, %xmm1
167; SSE2-NEXT:    movdqa %xmm1, %xmm2
168; SSE2-NEXT:    pandn %xmm0, %xmm2
169; SSE2-NEXT:    psllw $1, %xmm0
170; SSE2-NEXT:    pand %xmm1, %xmm0
171; SSE2-NEXT:    por %xmm2, %xmm0
172; SSE2-NEXT:    retq
173;
174; SSE41-LABEL: var_shift_v8i16:
175; SSE41:       # BB#0:
176; SSE41-NEXT:    movdqa %xmm0, %xmm2
177; SSE41-NEXT:    movdqa %xmm1, %xmm0
178; SSE41-NEXT:    psllw $12, %xmm0
179; SSE41-NEXT:    psllw $4, %xmm1
180; SSE41-NEXT:    por %xmm0, %xmm1
181; SSE41-NEXT:    movdqa %xmm1, %xmm3
182; SSE41-NEXT:    paddw %xmm3, %xmm3
183; SSE41-NEXT:    movdqa %xmm2, %xmm4
184; SSE41-NEXT:    psllw $8, %xmm4
185; SSE41-NEXT:    movdqa %xmm1, %xmm0
186; SSE41-NEXT:    pblendvb %xmm4, %xmm2
187; SSE41-NEXT:    movdqa %xmm2, %xmm1
188; SSE41-NEXT:    psllw $4, %xmm1
189; SSE41-NEXT:    movdqa %xmm3, %xmm0
190; SSE41-NEXT:    pblendvb %xmm1, %xmm2
191; SSE41-NEXT:    movdqa %xmm2, %xmm1
192; SSE41-NEXT:    psllw $2, %xmm1
193; SSE41-NEXT:    paddw %xmm3, %xmm3
194; SSE41-NEXT:    movdqa %xmm3, %xmm0
195; SSE41-NEXT:    pblendvb %xmm1, %xmm2
196; SSE41-NEXT:    movdqa %xmm2, %xmm1
197; SSE41-NEXT:    psllw $1, %xmm1
198; SSE41-NEXT:    paddw %xmm3, %xmm3
199; SSE41-NEXT:    movdqa %xmm3, %xmm0
200; SSE41-NEXT:    pblendvb %xmm1, %xmm2
201; SSE41-NEXT:    movdqa %xmm2, %xmm0
202; SSE41-NEXT:    retq
203;
204; AVX1-LABEL: var_shift_v8i16:
205; AVX1:       # BB#0:
206; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
207; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
208; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
209; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
210; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm3
211; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
212; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm1
213; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
214; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm1
215; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
216; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
217; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm1
218; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
219; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
220; AVX1-NEXT:    retq
221;
222; AVX2-LABEL: var_shift_v8i16:
223; AVX2:       # BB#0:
224; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
225; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
226; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
227; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
228; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
229; AVX2-NEXT:    vzeroupper
230; AVX2-NEXT:    retq
231;
232; XOP-LABEL: var_shift_v8i16:
233; XOP:       # BB#0:
234; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
235; XOP-NEXT:    retq
236;
237; X32-SSE-LABEL: var_shift_v8i16:
238; X32-SSE:       # BB#0:
239; X32-SSE-NEXT:    psllw $12, %xmm1
240; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
241; X32-SSE-NEXT:    psraw $15, %xmm2
242; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
243; X32-SSE-NEXT:    pandn %xmm0, %xmm3
244; X32-SSE-NEXT:    psllw $8, %xmm0
245; X32-SSE-NEXT:    pand %xmm2, %xmm0
246; X32-SSE-NEXT:    por %xmm3, %xmm0
247; X32-SSE-NEXT:    paddw %xmm1, %xmm1
248; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
249; X32-SSE-NEXT:    psraw $15, %xmm2
250; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
251; X32-SSE-NEXT:    pandn %xmm0, %xmm3
252; X32-SSE-NEXT:    psllw $4, %xmm0
253; X32-SSE-NEXT:    pand %xmm2, %xmm0
254; X32-SSE-NEXT:    por %xmm3, %xmm0
255; X32-SSE-NEXT:    paddw %xmm1, %xmm1
256; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
257; X32-SSE-NEXT:    psraw $15, %xmm2
258; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
259; X32-SSE-NEXT:    pandn %xmm0, %xmm3
260; X32-SSE-NEXT:    psllw $2, %xmm0
261; X32-SSE-NEXT:    pand %xmm2, %xmm0
262; X32-SSE-NEXT:    por %xmm3, %xmm0
263; X32-SSE-NEXT:    paddw %xmm1, %xmm1
264; X32-SSE-NEXT:    psraw $15, %xmm1
265; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
266; X32-SSE-NEXT:    pandn %xmm0, %xmm2
267; X32-SSE-NEXT:    psllw $1, %xmm0
268; X32-SSE-NEXT:    pand %xmm1, %xmm0
269; X32-SSE-NEXT:    por %xmm2, %xmm0
270; X32-SSE-NEXT:    retl
271  %shift = shl <8 x i16> %a, %b
272  ret <8 x i16> %shift
273}
274
; shl <16 x i8> with per-lane variable counts. There is no x86 byte shift, so
; psllw $5 moves the 3 useful count bits to each byte's MSB, and three select
; steps apply shift-by-4, shift-by-2, then shift-by-1 (paddb = shl by 1),
; masking off bits that cross byte boundaries. SSE2 builds the select mask
; with pcmpgtb against zero (sign test); SSE41/AVX use pblendvb, which keys
; off the mask's sign bits. XOP has a native byte shift (vpshlb).
275define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
276; SSE2-LABEL: var_shift_v16i8:
277; SSE2:       # BB#0:
278; SSE2-NEXT:    psllw $5, %xmm1
279; SSE2-NEXT:    pxor %xmm2, %xmm2
280; SSE2-NEXT:    pxor %xmm3, %xmm3
281; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
282; SSE2-NEXT:    movdqa %xmm3, %xmm4
283; SSE2-NEXT:    pandn %xmm0, %xmm4
284; SSE2-NEXT:    psllw $4, %xmm0
285; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
286; SSE2-NEXT:    pand %xmm3, %xmm0
287; SSE2-NEXT:    por %xmm4, %xmm0
288; SSE2-NEXT:    paddb %xmm1, %xmm1
289; SSE2-NEXT:    pxor %xmm3, %xmm3
290; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
291; SSE2-NEXT:    movdqa %xmm3, %xmm4
292; SSE2-NEXT:    pandn %xmm0, %xmm4
293; SSE2-NEXT:    psllw $2, %xmm0
294; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
295; SSE2-NEXT:    pand %xmm3, %xmm0
296; SSE2-NEXT:    por %xmm4, %xmm0
297; SSE2-NEXT:    paddb %xmm1, %xmm1
298; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
299; SSE2-NEXT:    movdqa %xmm2, %xmm1
300; SSE2-NEXT:    pandn %xmm0, %xmm1
301; SSE2-NEXT:    paddb %xmm0, %xmm0
302; SSE2-NEXT:    pand %xmm2, %xmm0
303; SSE2-NEXT:    por %xmm1, %xmm0
304; SSE2-NEXT:    retq
305;
306; SSE41-LABEL: var_shift_v16i8:
307; SSE41:       # BB#0:
308; SSE41-NEXT:    movdqa %xmm0, %xmm2
309; SSE41-NEXT:    psllw $5, %xmm1
310; SSE41-NEXT:    movdqa %xmm2, %xmm3
311; SSE41-NEXT:    psllw $4, %xmm3
312; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
313; SSE41-NEXT:    movdqa %xmm1, %xmm0
314; SSE41-NEXT:    pblendvb %xmm3, %xmm2
315; SSE41-NEXT:    movdqa %xmm2, %xmm3
316; SSE41-NEXT:    psllw $2, %xmm3
317; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
318; SSE41-NEXT:    paddb %xmm1, %xmm1
319; SSE41-NEXT:    movdqa %xmm1, %xmm0
320; SSE41-NEXT:    pblendvb %xmm3, %xmm2
321; SSE41-NEXT:    movdqa %xmm2, %xmm3
322; SSE41-NEXT:    paddb %xmm3, %xmm3
323; SSE41-NEXT:    paddb %xmm1, %xmm1
324; SSE41-NEXT:    movdqa %xmm1, %xmm0
325; SSE41-NEXT:    pblendvb %xmm3, %xmm2
326; SSE41-NEXT:    movdqa %xmm2, %xmm0
327; SSE41-NEXT:    retq
328;
329; AVX-LABEL: var_shift_v16i8:
330; AVX:       # BB#0:
331; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
332; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
333; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
334; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
335; AVX-NEXT:    vpsllw $2, %xmm0, %xmm2
336; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
337; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
338; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
339; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
340; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
341; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
342; AVX-NEXT:    retq
343;
344; XOP-LABEL: var_shift_v16i8:
345; XOP:       # BB#0:
346; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
347; XOP-NEXT:    retq
348;
349; X32-SSE-LABEL: var_shift_v16i8:
350; X32-SSE:       # BB#0:
351; X32-SSE-NEXT:    psllw $5, %xmm1
352; X32-SSE-NEXT:    pxor %xmm2, %xmm2
353; X32-SSE-NEXT:    pxor %xmm3, %xmm3
354; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
355; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
356; X32-SSE-NEXT:    pandn %xmm0, %xmm4
357; X32-SSE-NEXT:    psllw $4, %xmm0
358; X32-SSE-NEXT:    pand .LCPI3_0, %xmm0
359; X32-SSE-NEXT:    pand %xmm3, %xmm0
360; X32-SSE-NEXT:    por %xmm4, %xmm0
361; X32-SSE-NEXT:    paddb %xmm1, %xmm1
362; X32-SSE-NEXT:    pxor %xmm3, %xmm3
363; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
364; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
365; X32-SSE-NEXT:    pandn %xmm0, %xmm4
366; X32-SSE-NEXT:    psllw $2, %xmm0
367; X32-SSE-NEXT:    pand .LCPI3_1, %xmm0
368; X32-SSE-NEXT:    pand %xmm3, %xmm0
369; X32-SSE-NEXT:    por %xmm4, %xmm0
370; X32-SSE-NEXT:    paddb %xmm1, %xmm1
371; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
372; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
373; X32-SSE-NEXT:    pandn %xmm0, %xmm1
374; X32-SSE-NEXT:    paddb %xmm0, %xmm0
375; X32-SSE-NEXT:    pand %xmm2, %xmm0
376; X32-SSE-NEXT:    por %xmm1, %xmm0
377; X32-SSE-NEXT:    retl
378  %shift = shl <16 x i8> %a, %b
379  ret <16 x i8> %shift
380}
381
382;
383; Uniform Variable Shifts
384;
385
; shl <2 x i64> by a splatted count: all lanes share one count, so a single
; psllq with the count in xmm1 suffices (psllq reads only the low 64 bits).
; The 32-bit target first zeroes the upper half of the count register (movq).
386define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
387; SSE-LABEL: splatvar_shift_v2i64:
388; SSE:       # BB#0:
389; SSE-NEXT:    psllq %xmm1, %xmm0
390; SSE-NEXT:    retq
391;
392; AVX-LABEL: splatvar_shift_v2i64:
393; AVX:       # BB#0:
394; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
395; AVX-NEXT:    retq
396;
397; XOP-LABEL: splatvar_shift_v2i64:
398; XOP:       # BB#0:
399; XOP-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
400; XOP-NEXT:    retq
401;
402; X32-SSE-LABEL: splatvar_shift_v2i64:
403; X32-SSE:       # BB#0:
404; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
405; X32-SSE-NEXT:    psllq %xmm1, %xmm0
406; X32-SSE-NEXT:    retl
407  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
408  %shift = shl <2 x i64> %a, %splat
409  ret <2 x i64> %shift
410}
411
; shl <4 x i32> by a splatted count: pslld takes a single scalar count from
; its xmm operand, so the lowering just zero-extends the low i32 of the count
; vector (movss-from-zeroed / pblendw-with-zero) and issues one pslld.
412define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
413; SSE2-LABEL: splatvar_shift_v4i32:
414; SSE2:       # BB#0:
415; SSE2-NEXT:    xorps %xmm2, %xmm2
416; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
417; SSE2-NEXT:    pslld %xmm2, %xmm0
418; SSE2-NEXT:    retq
419;
420; SSE41-LABEL: splatvar_shift_v4i32:
421; SSE41:       # BB#0:
422; SSE41-NEXT:    pxor %xmm2, %xmm2
423; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
424; SSE41-NEXT:    pslld %xmm2, %xmm0
425; SSE41-NEXT:    retq
426;
427; AVX-LABEL: splatvar_shift_v4i32:
428; AVX:       # BB#0:
429; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
430; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
431; AVX-NEXT:    vpslld %xmm1, %xmm0, %xmm0
432; AVX-NEXT:    retq
433;
434; XOP-LABEL: splatvar_shift_v4i32:
435; XOP:       # BB#0:
436; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
437; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
438; XOP-NEXT:    vpslld %xmm1, %xmm0, %xmm0
439; XOP-NEXT:    retq
440;
441; X32-SSE-LABEL: splatvar_shift_v4i32:
442; X32-SSE:       # BB#0:
443; X32-SSE-NEXT:    xorps %xmm2, %xmm2
444; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
445; X32-SSE-NEXT:    pslld %xmm2, %xmm0
446; X32-SSE-NEXT:    retl
447  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
448  %shift = shl <4 x i32> %a, %splat
449  ret <4 x i32> %shift
450}
451
; shl <8 x i16> by a splatted count: zero-extend the low i16 of the count
; (SSE2 bounces through a GPR with movd+movzwl; SSE41+ uses pblendw against
; zero) and issue a single psllw.
452define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
453; SSE2-LABEL: splatvar_shift_v8i16:
454; SSE2:       # BB#0:
455; SSE2-NEXT:    movd %xmm1, %eax
456; SSE2-NEXT:    movzwl %ax, %eax
457; SSE2-NEXT:    movd %eax, %xmm1
458; SSE2-NEXT:    psllw %xmm1, %xmm0
459; SSE2-NEXT:    retq
460;
461; SSE41-LABEL: splatvar_shift_v8i16:
462; SSE41:       # BB#0:
463; SSE41-NEXT:    pxor %xmm2, %xmm2
464; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
465; SSE41-NEXT:    psllw %xmm2, %xmm0
466; SSE41-NEXT:    retq
467;
468; AVX-LABEL: splatvar_shift_v8i16:
469; AVX:       # BB#0:
470; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
471; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
472; AVX-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
473; AVX-NEXT:    retq
474;
475; XOP-LABEL: splatvar_shift_v8i16:
476; XOP:       # BB#0:
477; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
478; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
479; XOP-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
480; XOP-NEXT:    retq
481;
482; X32-SSE-LABEL: splatvar_shift_v8i16:
483; X32-SSE:       # BB#0:
484; X32-SSE-NEXT:    movd %xmm1, %eax
485; X32-SSE-NEXT:    movzwl %ax, %eax
486; X32-SSE-NEXT:    movd %eax, %xmm1
487; X32-SSE-NEXT:    psllw %xmm1, %xmm0
488; X32-SSE-NEXT:    retl
489  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
490  %shift = shl <8 x i16> %a, %splat
491  ret <8 x i16> %shift
492}
493
; shl <16 x i8> by a splatted count: with no native byte shift, the splat
; gains nothing — each target broadcasts the low byte (SSE2 via
; punpcklbw/pshufd/pshuflw/pshufhw, SSE41/AVX1 via pshufb with a zero mask,
; AVX2 via vpbroadcastb) and then runs the same 4/2/1 blended shift sequence
; as the fully-variable case; XOP broadcasts and uses vpshlb.
494define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
495; SSE2-LABEL: splatvar_shift_v16i8:
496; SSE2:       # BB#0:
497; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
498; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
499; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
500; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
501; SSE2-NEXT:    psllw $5, %xmm2
502; SSE2-NEXT:    pxor %xmm1, %xmm1
503; SSE2-NEXT:    pxor %xmm3, %xmm3
504; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
505; SSE2-NEXT:    movdqa %xmm3, %xmm4
506; SSE2-NEXT:    pandn %xmm0, %xmm4
507; SSE2-NEXT:    psllw $4, %xmm0
508; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
509; SSE2-NEXT:    pand %xmm3, %xmm0
510; SSE2-NEXT:    por %xmm4, %xmm0
511; SSE2-NEXT:    paddb %xmm2, %xmm2
512; SSE2-NEXT:    pxor %xmm3, %xmm3
513; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
514; SSE2-NEXT:    movdqa %xmm3, %xmm4
515; SSE2-NEXT:    pandn %xmm0, %xmm4
516; SSE2-NEXT:    psllw $2, %xmm0
517; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
518; SSE2-NEXT:    pand %xmm3, %xmm0
519; SSE2-NEXT:    por %xmm4, %xmm0
520; SSE2-NEXT:    paddb %xmm2, %xmm2
521; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
522; SSE2-NEXT:    movdqa %xmm1, %xmm2
523; SSE2-NEXT:    pandn %xmm0, %xmm2
524; SSE2-NEXT:    paddb %xmm0, %xmm0
525; SSE2-NEXT:    pand %xmm1, %xmm0
526; SSE2-NEXT:    por %xmm2, %xmm0
527; SSE2-NEXT:    retq
528;
529; SSE41-LABEL: splatvar_shift_v16i8:
530; SSE41:       # BB#0:
531; SSE41-NEXT:    movdqa %xmm0, %xmm2
532; SSE41-NEXT:    pxor %xmm0, %xmm0
533; SSE41-NEXT:    pshufb %xmm0, %xmm1
534; SSE41-NEXT:    psllw $5, %xmm1
535; SSE41-NEXT:    movdqa %xmm1, %xmm3
536; SSE41-NEXT:    paddb %xmm3, %xmm3
537; SSE41-NEXT:    movdqa %xmm2, %xmm4
538; SSE41-NEXT:    psllw $4, %xmm4
539; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
540; SSE41-NEXT:    movdqa %xmm1, %xmm0
541; SSE41-NEXT:    pblendvb %xmm4, %xmm2
542; SSE41-NEXT:    movdqa %xmm2, %xmm1
543; SSE41-NEXT:    psllw $2, %xmm1
544; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
545; SSE41-NEXT:    movdqa %xmm3, %xmm0
546; SSE41-NEXT:    pblendvb %xmm1, %xmm2
547; SSE41-NEXT:    movdqa %xmm2, %xmm1
548; SSE41-NEXT:    paddb %xmm1, %xmm1
549; SSE41-NEXT:    paddb %xmm3, %xmm3
550; SSE41-NEXT:    movdqa %xmm3, %xmm0
551; SSE41-NEXT:    pblendvb %xmm1, %xmm2
552; SSE41-NEXT:    movdqa %xmm2, %xmm0
553; SSE41-NEXT:    retq
554;
555; AVX1-LABEL: splatvar_shift_v16i8:
556; AVX1:       # BB#0:
557; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
558; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
559; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
560; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm2
561; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm3
562; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
563; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
564; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm1
565; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
566; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
567; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
568; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
569; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
570; AVX1-NEXT:    retq
571;
572; AVX2-LABEL: splatvar_shift_v16i8:
573; AVX2:       # BB#0:
574; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
575; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
576; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm2
577; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
578; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
579; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm2
580; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
581; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
582; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
583; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
584; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
585; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
586; AVX2-NEXT:    retq
587;
588; XOPAVX1-LABEL: splatvar_shift_v16i8:
589; XOPAVX1:       # BB#0:
590; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
591; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
592; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
593; XOPAVX1-NEXT:    retq
594;
595; XOPAVX2-LABEL: splatvar_shift_v16i8:
596; XOPAVX2:       # BB#0:
597; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
598; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
599; XOPAVX2-NEXT:    retq
600;
601; X32-SSE-LABEL: splatvar_shift_v16i8:
602; X32-SSE:       # BB#0:
603; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
604; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
605; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
606; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4]
607; X32-SSE-NEXT:    psllw $5, %xmm2
608; X32-SSE-NEXT:    pxor %xmm1, %xmm1
609; X32-SSE-NEXT:    pxor %xmm3, %xmm3
610; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
611; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
612; X32-SSE-NEXT:    pandn %xmm0, %xmm4
613; X32-SSE-NEXT:    psllw $4, %xmm0
614; X32-SSE-NEXT:    pand .LCPI7_0, %xmm0
615; X32-SSE-NEXT:    pand %xmm3, %xmm0
616; X32-SSE-NEXT:    por %xmm4, %xmm0
617; X32-SSE-NEXT:    paddb %xmm2, %xmm2
618; X32-SSE-NEXT:    pxor %xmm3, %xmm3
619; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
620; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
621; X32-SSE-NEXT:    pandn %xmm0, %xmm4
622; X32-SSE-NEXT:    psllw $2, %xmm0
623; X32-SSE-NEXT:    pand .LCPI7_1, %xmm0
624; X32-SSE-NEXT:    pand %xmm3, %xmm0
625; X32-SSE-NEXT:    por %xmm4, %xmm0
626; X32-SSE-NEXT:    paddb %xmm2, %xmm2
627; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
628; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
629; X32-SSE-NEXT:    pandn %xmm0, %xmm2
630; X32-SSE-NEXT:    paddb %xmm0, %xmm0
631; X32-SSE-NEXT:    pand %xmm1, %xmm0
632; X32-SSE-NEXT:    por %xmm2, %xmm0
633; X32-SSE-NEXT:    retl
634  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
635  %shift = shl <16 x i8> %a, %splat
636  ret <16 x i8> %shift
637}
638
639;
640; Constant Shifts
641;
642
; shl <2 x i64> by distinct constants <1, 7>: pre-AVX2 targets issue two
; immediate psllq shifts and blend the halves; AVX2/XOPAVX2 use vpsllvq with
; a constant-pool operand, XOPAVX1 uses vpshlq.
643define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
644; SSE2-LABEL: constant_shift_v2i64:
645; SSE2:       # BB#0:
646; SSE2-NEXT:    movdqa %xmm0, %xmm1
647; SSE2-NEXT:    psllq $7, %xmm1
648; SSE2-NEXT:    psllq $1, %xmm0
649; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
650; SSE2-NEXT:    movapd %xmm1, %xmm0
651; SSE2-NEXT:    retq
652;
653; SSE41-LABEL: constant_shift_v2i64:
654; SSE41:       # BB#0:
655; SSE41-NEXT:    movdqa %xmm0, %xmm1
656; SSE41-NEXT:    psllq $7, %xmm1
657; SSE41-NEXT:    psllq $1, %xmm0
658; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
659; SSE41-NEXT:    retq
660;
661; AVX1-LABEL: constant_shift_v2i64:
662; AVX1:       # BB#0:
663; AVX1-NEXT:    vpsllq $7, %xmm0, %xmm1
664; AVX1-NEXT:    vpsllq $1, %xmm0, %xmm0
665; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
666; AVX1-NEXT:    retq
667;
668; AVX2-LABEL: constant_shift_v2i64:
669; AVX2:       # BB#0:
670; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
671; AVX2-NEXT:    retq
672;
673; XOPAVX1-LABEL: constant_shift_v2i64:
674; XOPAVX1:       # BB#0:
675; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
676; XOPAVX1-NEXT:    retq
677;
678; XOPAVX2-LABEL: constant_shift_v2i64:
679; XOPAVX2:       # BB#0:
680; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
681; XOPAVX2-NEXT:    retq
682;
683; X32-SSE-LABEL: constant_shift_v2i64:
684; X32-SSE:       # BB#0:
685; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
686; X32-SSE-NEXT:    psllq $7, %xmm1
687; X32-SSE-NEXT:    psllq $1, %xmm0
688; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
689; X32-SSE-NEXT:    movapd %xmm1, %xmm0
690; X32-SSE-NEXT:    retl
691  %shift = shl <2 x i64> %a, <i64 1, i64 7>
692  ret <2 x i64> %shift
693}
694
; shl <4 x i32> by constants <4,5,6,7>: folded to a multiply by
; [16,32,64,128] — pmulld where available, otherwise the SSE2
; pmuludq/pshufd/punpckldq expansion; AVX2/XOP keep it as a variable shift
; with a constant-pool count.
695define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
696; SSE2-LABEL: constant_shift_v4i32:
697; SSE2:       # BB#0:
698; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
699; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
700; SSE2-NEXT:    pmuludq %xmm1, %xmm0
701; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
702; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
703; SSE2-NEXT:    pmuludq %xmm2, %xmm1
704; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
705; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
706; SSE2-NEXT:    retq
707;
708; SSE41-LABEL: constant_shift_v4i32:
709; SSE41:       # BB#0:
710; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
711; SSE41-NEXT:    retq
712;
713; AVX1-LABEL: constant_shift_v4i32:
714; AVX1:       # BB#0:
715; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
716; AVX1-NEXT:    retq
717;
718; AVX2-LABEL: constant_shift_v4i32:
719; AVX2:       # BB#0:
720; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
721; AVX2-NEXT:    retq
722;
723; XOPAVX1-LABEL: constant_shift_v4i32:
724; XOPAVX1:       # BB#0:
725; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
726; XOPAVX1-NEXT:    retq
727;
728; XOPAVX2-LABEL: constant_shift_v4i32:
729; XOPAVX2:       # BB#0:
730; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
731; XOPAVX2-NEXT:    retq
732;
733; X32-SSE-LABEL: constant_shift_v4i32:
734; X32-SSE:       # BB#0:
735; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
736; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
737; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
738; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
739; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
740; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
741; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
742; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
743; X32-SSE-NEXT:    retl
744  %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
745  ret <4 x i32> %shift
746}
747
; shl <8 x i16> by constants <0..7>: folded to a single pmullw/vpmullw
; against a constant-pool vector of powers of two; XOP keeps it as vpshlw.
748define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
749; SSE-LABEL: constant_shift_v8i16:
750; SSE:       # BB#0:
751; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
752; SSE-NEXT:    retq
753;
754; AVX-LABEL: constant_shift_v8i16:
755; AVX:       # BB#0:
756; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
757; AVX-NEXT:    retq
758;
759; XOP-LABEL: constant_shift_v8i16:
760; XOP:       # BB#0:
761; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
762; XOP-NEXT:    retq
763;
764; X32-SSE-LABEL: constant_shift_v8i16:
765; X32-SSE:       # BB#0:
766; X32-SSE-NEXT:    pmullw .LCPI10_0, %xmm0
767; X32-SSE-NEXT:    retl
768  %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
769  ret <8 x i16> %shift
770}
771
; shl <16 x i8> by per-lane constants: even with constant counts, non-XOP
; targets materialize the count vector and run the same psllw $5 + 4/2/1
; blended shift sequence as the variable case (no byte shift exists);
; only XOP collapses it to a single vpshlb with a constant-pool operand.
772define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
773; SSE2-LABEL: constant_shift_v16i8:
774; SSE2:       # BB#0:
775; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
776; SSE2-NEXT:    psllw $5, %xmm2
777; SSE2-NEXT:    pxor %xmm1, %xmm1
778; SSE2-NEXT:    pxor %xmm3, %xmm3
779; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
780; SSE2-NEXT:    movdqa %xmm3, %xmm4
781; SSE2-NEXT:    pandn %xmm0, %xmm4
782; SSE2-NEXT:    psllw $4, %xmm0
783; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
784; SSE2-NEXT:    pand %xmm3, %xmm0
785; SSE2-NEXT:    por %xmm4, %xmm0
786; SSE2-NEXT:    paddb %xmm2, %xmm2
787; SSE2-NEXT:    pxor %xmm3, %xmm3
788; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
789; SSE2-NEXT:    movdqa %xmm3, %xmm4
790; SSE2-NEXT:    pandn %xmm0, %xmm4
791; SSE2-NEXT:    psllw $2, %xmm0
792; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
793; SSE2-NEXT:    pand %xmm3, %xmm0
794; SSE2-NEXT:    por %xmm4, %xmm0
795; SSE2-NEXT:    paddb %xmm2, %xmm2
796; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
797; SSE2-NEXT:    movdqa %xmm1, %xmm2
798; SSE2-NEXT:    pandn %xmm0, %xmm2
799; SSE2-NEXT:    paddb %xmm0, %xmm0
800; SSE2-NEXT:    pand %xmm1, %xmm0
801; SSE2-NEXT:    por %xmm2, %xmm0
802; SSE2-NEXT:    retq
803;
804; SSE41-LABEL: constant_shift_v16i8:
805; SSE41:       # BB#0:
806; SSE41-NEXT:    movdqa %xmm0, %xmm1
807; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
808; SSE41-NEXT:    psllw $5, %xmm0
809; SSE41-NEXT:    movdqa %xmm1, %xmm2
810; SSE41-NEXT:    psllw $4, %xmm2
811; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
812; SSE41-NEXT:    pblendvb %xmm2, %xmm1
813; SSE41-NEXT:    movdqa %xmm1, %xmm2
814; SSE41-NEXT:    psllw $2, %xmm2
815; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
816; SSE41-NEXT:    paddb %xmm0, %xmm0
817; SSE41-NEXT:    pblendvb %xmm2, %xmm1
818; SSE41-NEXT:    movdqa %xmm1, %xmm2
819; SSE41-NEXT:    paddb %xmm2, %xmm2
820; SSE41-NEXT:    paddb %xmm0, %xmm0
821; SSE41-NEXT:    pblendvb %xmm2, %xmm1
822; SSE41-NEXT:    movdqa %xmm1, %xmm0
823; SSE41-NEXT:    retq
824;
825; AVX-LABEL: constant_shift_v16i8:
826; AVX:       # BB#0:
827; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
828; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
829; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
830; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
831; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
832; AVX-NEXT:    vpsllw $2, %xmm0, %xmm2
833; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
834; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
835; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
836; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
837; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
838; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
839; AVX-NEXT:    retq
840;
841; XOP-LABEL: constant_shift_v16i8:
842; XOP:       # BB#0:
843; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
844; XOP-NEXT:    retq
845;
846; X32-SSE-LABEL: constant_shift_v16i8:
847; X32-SSE:       # BB#0:
848; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
849; X32-SSE-NEXT:    psllw $5, %xmm2
850; X32-SSE-NEXT:    pxor %xmm1, %xmm1
851; X32-SSE-NEXT:    pxor %xmm3, %xmm3
852; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
853; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
854; X32-SSE-NEXT:    pandn %xmm0, %xmm4
855; X32-SSE-NEXT:    psllw $4, %xmm0
856; X32-SSE-NEXT:    pand .LCPI11_1, %xmm0
857; X32-SSE-NEXT:    pand %xmm3, %xmm0
858; X32-SSE-NEXT:    por %xmm4, %xmm0
859; X32-SSE-NEXT:    paddb %xmm2, %xmm2
860; X32-SSE-NEXT:    pxor %xmm3, %xmm3
861; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
862; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
863; X32-SSE-NEXT:    pandn %xmm0, %xmm4
864; X32-SSE-NEXT:    psllw $2, %xmm0
865; X32-SSE-NEXT:    pand .LCPI11_2, %xmm0
866; X32-SSE-NEXT:    pand %xmm3, %xmm0
867; X32-SSE-NEXT:    por %xmm4, %xmm0
868; X32-SSE-NEXT:    paddb %xmm2, %xmm2
869; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
870; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
871; X32-SSE-NEXT:    pandn %xmm0, %xmm2
872; X32-SSE-NEXT:    paddb %xmm0, %xmm0
873; X32-SSE-NEXT:    pand %xmm1, %xmm0
874; X32-SSE-NEXT:    por %xmm2, %xmm0
875; X32-SSE-NEXT:    retl
876  %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
877  ret <16 x i8> %shift
878}
879
880;
881; Uniform Constant Shifts
882;
883
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; Uniform (splat) shift of both i64 lanes left by the constant 7.
; Every configuration below lowers this to a single immediate-form
; psllq/vpsllq, as shown by the generated check lines.
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    psllq $7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllq $7, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vpsllq $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllq $7, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}
907
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; Uniform (splat) shift of all four i32 lanes left by the constant 5.
; Lowers to a single immediate-form pslld/vpslld on every configuration.
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    pslld $5, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpslld $5, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vpslld $5, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pslld $5, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}
931
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; Uniform (splat) shift of all eight i16 lanes left by the constant 3.
; Lowers to a single immediate-form psllw/vpsllw on every configuration.
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    psllw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpsllw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $3, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}
955
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; Uniform (splat) shift of all sixteen i8 lanes left by the constant 3.
; SSE/AVX have no byte-granularity shift, so the checks below show a
; word shift (psllw $3) followed by a pand with a constant-pool mask to
; clear the bits shifted in from the neighbouring byte; only XOP has a
; native per-byte shift (vpshlb) and needs no mask.
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    psllw $3, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $3, %xmm0
; X32-SSE-NEXT:    pand .LCPI15_0, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}
982