; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;

17define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
18; SSE2-LABEL: var_shift_v2i64:
19; SSE2:       # BB#0:
20; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
21; SSE2-NEXT:    movdqa %xmm0, %xmm2
22; SSE2-NEXT:    psllq %xmm3, %xmm2
23; SSE2-NEXT:    psllq %xmm1, %xmm0
24; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
25; SSE2-NEXT:    movapd %xmm2, %xmm0
26; SSE2-NEXT:    retq
27;
28; SSE41-LABEL: var_shift_v2i64:
29; SSE41:       # BB#0:
30; SSE41-NEXT:    movdqa %xmm0, %xmm2
31; SSE41-NEXT:    psllq %xmm1, %xmm2
32; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
33; SSE41-NEXT:    psllq %xmm1, %xmm0
34; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
35; SSE41-NEXT:    retq
36;
37; AVX1-LABEL: var_shift_v2i64:
38; AVX1:       # BB#0:
39; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm2
40; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
41; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
42; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
43; AVX1-NEXT:    retq
44;
45; AVX2-LABEL: var_shift_v2i64:
46; AVX2:       # BB#0:
47; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
48; AVX2-NEXT:    retq
49;
50; XOPAVX1-LABEL: var_shift_v2i64:
51; XOPAVX1:       # BB#0:
52; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
53; XOPAVX1-NEXT:    retq
54;
55; XOPAVX2-LABEL: var_shift_v2i64:
56; XOPAVX2:       # BB#0:
57; XOPAVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
58; XOPAVX2-NEXT:    retq
59;
60; AVX512-LABEL: var_shift_v2i64:
61; AVX512:       ## BB#0:
62; AVX512-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
63; AVX512-NEXT:    retq
64;
65; X32-SSE-LABEL: var_shift_v2i64:
66; X32-SSE:       # BB#0:
67; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
68; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
69; X32-SSE-NEXT:    psllq %xmm3, %xmm2
70; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
71; X32-SSE-NEXT:    psllq %xmm1, %xmm0
72; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
73; X32-SSE-NEXT:    movapd %xmm2, %xmm0
74; X32-SSE-NEXT:    retl
75  %shift = shl <2 x i64> %a, %b
76  ret <2 x i64> %shift
77}
79define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
80; SSE2-LABEL: var_shift_v4i32:
81; SSE2:       # BB#0:
82; SSE2-NEXT:    pslld $23, %xmm1
83; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
84; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
85; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
86; SSE2-NEXT:    pmuludq %xmm0, %xmm1
87; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
88; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
89; SSE2-NEXT:    pmuludq %xmm2, %xmm0
90; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
91; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
92; SSE2-NEXT:    movdqa %xmm1, %xmm0
93; SSE2-NEXT:    retq
94;
95; SSE41-LABEL: var_shift_v4i32:
96; SSE41:       # BB#0:
97; SSE41-NEXT:    pslld $23, %xmm1
98; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
99; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
100; SSE41-NEXT:    pmulld %xmm1, %xmm0
101; SSE41-NEXT:    retq
102;
103; AVX1-LABEL: var_shift_v4i32:
104; AVX1:       # BB#0:
105; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
106; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
107; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
108; AVX1-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
109; AVX1-NEXT:    retq
110;
111; AVX2-LABEL: var_shift_v4i32:
112; AVX2:       # BB#0:
113; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
114; AVX2-NEXT:    retq
115;
116; XOPAVX1-LABEL: var_shift_v4i32:
117; XOPAVX1:       # BB#0:
118; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
119; XOPAVX1-NEXT:    retq
120;
121; XOPAVX2-LABEL: var_shift_v4i32:
122; XOPAVX2:       # BB#0:
123; XOPAVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
124; XOPAVX2-NEXT:    retq
125;
126; AVX512-LABEL: var_shift_v4i32:
127; AVX512:       ## BB#0:
128; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
129; AVX512-NEXT:    retq
130;
131; X32-SSE-LABEL: var_shift_v4i32:
132; X32-SSE:       # BB#0:
133; X32-SSE-NEXT:    pslld $23, %xmm1
134; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
135; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
136; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
137; X32-SSE-NEXT:    pmuludq %xmm0, %xmm1
138; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
139; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
140; X32-SSE-NEXT:    pmuludq %xmm2, %xmm0
141; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
142; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
143; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
144; X32-SSE-NEXT:    retl
145  %shift = shl <4 x i32> %a, %b
146  ret <4 x i32> %shift
147}
149define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
150; SSE2-LABEL: var_shift_v8i16:
151; SSE2:       # BB#0:
152; SSE2-NEXT:    psllw $12, %xmm1
153; SSE2-NEXT:    movdqa %xmm1, %xmm2
154; SSE2-NEXT:    psraw $15, %xmm2
155; SSE2-NEXT:    movdqa %xmm2, %xmm3
156; SSE2-NEXT:    pandn %xmm0, %xmm3
157; SSE2-NEXT:    psllw $8, %xmm0
158; SSE2-NEXT:    pand %xmm2, %xmm0
159; SSE2-NEXT:    por %xmm3, %xmm0
160; SSE2-NEXT:    paddw %xmm1, %xmm1
161; SSE2-NEXT:    movdqa %xmm1, %xmm2
162; SSE2-NEXT:    psraw $15, %xmm2
163; SSE2-NEXT:    movdqa %xmm2, %xmm3
164; SSE2-NEXT:    pandn %xmm0, %xmm3
165; SSE2-NEXT:    psllw $4, %xmm0
166; SSE2-NEXT:    pand %xmm2, %xmm0
167; SSE2-NEXT:    por %xmm3, %xmm0
168; SSE2-NEXT:    paddw %xmm1, %xmm1
169; SSE2-NEXT:    movdqa %xmm1, %xmm2
170; SSE2-NEXT:    psraw $15, %xmm2
171; SSE2-NEXT:    movdqa %xmm2, %xmm3
172; SSE2-NEXT:    pandn %xmm0, %xmm3
173; SSE2-NEXT:    psllw $2, %xmm0
174; SSE2-NEXT:    pand %xmm2, %xmm0
175; SSE2-NEXT:    por %xmm3, %xmm0
176; SSE2-NEXT:    paddw %xmm1, %xmm1
177; SSE2-NEXT:    psraw $15, %xmm1
178; SSE2-NEXT:    movdqa %xmm1, %xmm2
179; SSE2-NEXT:    pandn %xmm0, %xmm2
180; SSE2-NEXT:    psllw $1, %xmm0
181; SSE2-NEXT:    pand %xmm1, %xmm0
182; SSE2-NEXT:    por %xmm2, %xmm0
183; SSE2-NEXT:    retq
184;
185; SSE41-LABEL: var_shift_v8i16:
186; SSE41:       # BB#0:
187; SSE41-NEXT:    movdqa %xmm0, %xmm2
188; SSE41-NEXT:    movdqa %xmm1, %xmm0
189; SSE41-NEXT:    psllw $12, %xmm0
190; SSE41-NEXT:    psllw $4, %xmm1
191; SSE41-NEXT:    por %xmm0, %xmm1
192; SSE41-NEXT:    movdqa %xmm1, %xmm3
193; SSE41-NEXT:    paddw %xmm3, %xmm3
194; SSE41-NEXT:    movdqa %xmm2, %xmm4
195; SSE41-NEXT:    psllw $8, %xmm4
196; SSE41-NEXT:    movdqa %xmm1, %xmm0
197; SSE41-NEXT:    pblendvb %xmm4, %xmm2
198; SSE41-NEXT:    movdqa %xmm2, %xmm1
199; SSE41-NEXT:    psllw $4, %xmm1
200; SSE41-NEXT:    movdqa %xmm3, %xmm0
201; SSE41-NEXT:    pblendvb %xmm1, %xmm2
202; SSE41-NEXT:    movdqa %xmm2, %xmm1
203; SSE41-NEXT:    psllw $2, %xmm1
204; SSE41-NEXT:    paddw %xmm3, %xmm3
205; SSE41-NEXT:    movdqa %xmm3, %xmm0
206; SSE41-NEXT:    pblendvb %xmm1, %xmm2
207; SSE41-NEXT:    movdqa %xmm2, %xmm1
208; SSE41-NEXT:    psllw $1, %xmm1
209; SSE41-NEXT:    paddw %xmm3, %xmm3
210; SSE41-NEXT:    movdqa %xmm3, %xmm0
211; SSE41-NEXT:    pblendvb %xmm1, %xmm2
212; SSE41-NEXT:    movdqa %xmm2, %xmm0
213; SSE41-NEXT:    retq
214;
215; AVX1-LABEL: var_shift_v8i16:
216; AVX1:       # BB#0:
217; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
218; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
219; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
220; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
221; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm3
222; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
223; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm1
224; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
225; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm1
226; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
227; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
228; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm1
229; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
230; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
231; AVX1-NEXT:    retq
232;
233; AVX2-LABEL: var_shift_v8i16:
234; AVX2:       # BB#0:
235; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
236; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
237; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
238; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
239; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
240; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
241; AVX2-NEXT:    vzeroupper
242; AVX2-NEXT:    retq
243;
244; XOP-LABEL: var_shift_v8i16:
245; XOP:       # BB#0:
246; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
247; XOP-NEXT:    retq
248;
249; AVX512-LABEL: var_shift_v8i16:
250; AVX512:       ## BB#0:
251; AVX512-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
252; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
253; AVX512-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
254; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
255; AVX512-NEXT:    retq
256;
257; X32-SSE-LABEL: var_shift_v8i16:
258; X32-SSE:       # BB#0:
259; X32-SSE-NEXT:    psllw $12, %xmm1
260; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
261; X32-SSE-NEXT:    psraw $15, %xmm2
262; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
263; X32-SSE-NEXT:    pandn %xmm0, %xmm3
264; X32-SSE-NEXT:    psllw $8, %xmm0
265; X32-SSE-NEXT:    pand %xmm2, %xmm0
266; X32-SSE-NEXT:    por %xmm3, %xmm0
267; X32-SSE-NEXT:    paddw %xmm1, %xmm1
268; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
269; X32-SSE-NEXT:    psraw $15, %xmm2
270; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
271; X32-SSE-NEXT:    pandn %xmm0, %xmm3
272; X32-SSE-NEXT:    psllw $4, %xmm0
273; X32-SSE-NEXT:    pand %xmm2, %xmm0
274; X32-SSE-NEXT:    por %xmm3, %xmm0
275; X32-SSE-NEXT:    paddw %xmm1, %xmm1
276; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
277; X32-SSE-NEXT:    psraw $15, %xmm2
278; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
279; X32-SSE-NEXT:    pandn %xmm0, %xmm3
280; X32-SSE-NEXT:    psllw $2, %xmm0
281; X32-SSE-NEXT:    pand %xmm2, %xmm0
282; X32-SSE-NEXT:    por %xmm3, %xmm0
283; X32-SSE-NEXT:    paddw %xmm1, %xmm1
284; X32-SSE-NEXT:    psraw $15, %xmm1
285; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
286; X32-SSE-NEXT:    pandn %xmm0, %xmm2
287; X32-SSE-NEXT:    psllw $1, %xmm0
288; X32-SSE-NEXT:    pand %xmm1, %xmm0
289; X32-SSE-NEXT:    por %xmm2, %xmm0
290; X32-SSE-NEXT:    retl
291  %shift = shl <8 x i16> %a, %b
292  ret <8 x i16> %shift
293}
295define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
296; SSE2-LABEL: var_shift_v16i8:
297; SSE2:       # BB#0:
298; SSE2-NEXT:    psllw $5, %xmm1
299; SSE2-NEXT:    pxor %xmm2, %xmm2
300; SSE2-NEXT:    pxor %xmm3, %xmm3
301; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
302; SSE2-NEXT:    movdqa %xmm3, %xmm4
303; SSE2-NEXT:    pandn %xmm0, %xmm4
304; SSE2-NEXT:    psllw $4, %xmm0
305; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
306; SSE2-NEXT:    pand %xmm3, %xmm0
307; SSE2-NEXT:    por %xmm4, %xmm0
308; SSE2-NEXT:    paddb %xmm1, %xmm1
309; SSE2-NEXT:    pxor %xmm3, %xmm3
310; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
311; SSE2-NEXT:    movdqa %xmm3, %xmm4
312; SSE2-NEXT:    pandn %xmm0, %xmm4
313; SSE2-NEXT:    psllw $2, %xmm0
314; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
315; SSE2-NEXT:    pand %xmm3, %xmm0
316; SSE2-NEXT:    por %xmm4, %xmm0
317; SSE2-NEXT:    paddb %xmm1, %xmm1
318; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
319; SSE2-NEXT:    movdqa %xmm2, %xmm1
320; SSE2-NEXT:    pandn %xmm0, %xmm1
321; SSE2-NEXT:    paddb %xmm0, %xmm0
322; SSE2-NEXT:    pand %xmm2, %xmm0
323; SSE2-NEXT:    por %xmm1, %xmm0
324; SSE2-NEXT:    retq
325;
326; SSE41-LABEL: var_shift_v16i8:
327; SSE41:       # BB#0:
328; SSE41-NEXT:    movdqa %xmm0, %xmm2
329; SSE41-NEXT:    psllw $5, %xmm1
330; SSE41-NEXT:    movdqa %xmm2, %xmm3
331; SSE41-NEXT:    psllw $4, %xmm3
332; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
333; SSE41-NEXT:    movdqa %xmm1, %xmm0
334; SSE41-NEXT:    pblendvb %xmm3, %xmm2
335; SSE41-NEXT:    movdqa %xmm2, %xmm3
336; SSE41-NEXT:    psllw $2, %xmm3
337; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
338; SSE41-NEXT:    paddb %xmm1, %xmm1
339; SSE41-NEXT:    movdqa %xmm1, %xmm0
340; SSE41-NEXT:    pblendvb %xmm3, %xmm2
341; SSE41-NEXT:    movdqa %xmm2, %xmm3
342; SSE41-NEXT:    paddb %xmm3, %xmm3
343; SSE41-NEXT:    paddb %xmm1, %xmm1
344; SSE41-NEXT:    movdqa %xmm1, %xmm0
345; SSE41-NEXT:    pblendvb %xmm3, %xmm2
346; SSE41-NEXT:    movdqa %xmm2, %xmm0
347; SSE41-NEXT:    retq
348;
349; AVX-LABEL: var_shift_v16i8:
350; AVX:       # BB#0:
351; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
352; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
353; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
354; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
355; AVX-NEXT:    vpsllw $2, %xmm0, %xmm2
356; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
357; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
358; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
359; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
360; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
361; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
362; AVX-NEXT:    retq
363;
364; XOP-LABEL: var_shift_v16i8:
365; XOP:       # BB#0:
366; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
367; XOP-NEXT:    retq
368;
369; AVX512-LABEL: var_shift_v16i8:
370; AVX512:       ## BB#0:
371; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
372; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm2
373; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
374; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
375; AVX512-NEXT:    vpsllw $2, %xmm0, %xmm2
376; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
377; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
378; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
379; AVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
380; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
381; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
382; AVX512-NEXT:    retq
383;
384; X32-SSE-LABEL: var_shift_v16i8:
385; X32-SSE:       # BB#0:
386; X32-SSE-NEXT:    psllw $5, %xmm1
387; X32-SSE-NEXT:    pxor %xmm2, %xmm2
388; X32-SSE-NEXT:    pxor %xmm3, %xmm3
389; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
390; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
391; X32-SSE-NEXT:    pandn %xmm0, %xmm4
392; X32-SSE-NEXT:    psllw $4, %xmm0
393; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
394; X32-SSE-NEXT:    pand %xmm3, %xmm0
395; X32-SSE-NEXT:    por %xmm4, %xmm0
396; X32-SSE-NEXT:    paddb %xmm1, %xmm1
397; X32-SSE-NEXT:    pxor %xmm3, %xmm3
398; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
399; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
400; X32-SSE-NEXT:    pandn %xmm0, %xmm4
401; X32-SSE-NEXT:    psllw $2, %xmm0
402; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
403; X32-SSE-NEXT:    pand %xmm3, %xmm0
404; X32-SSE-NEXT:    por %xmm4, %xmm0
405; X32-SSE-NEXT:    paddb %xmm1, %xmm1
406; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
407; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
408; X32-SSE-NEXT:    pandn %xmm0, %xmm1
409; X32-SSE-NEXT:    paddb %xmm0, %xmm0
410; X32-SSE-NEXT:    pand %xmm2, %xmm0
411; X32-SSE-NEXT:    por %xmm1, %xmm0
412; X32-SSE-NEXT:    retl
413  %shift = shl <16 x i8> %a, %b
414  ret <16 x i8> %shift
415}

;
; Uniform Variable Shifts
;

421define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
422; SSE-LABEL: splatvar_shift_v2i64:
423; SSE:       # BB#0:
424; SSE-NEXT:    psllq %xmm1, %xmm0
425; SSE-NEXT:    retq
426;
427; AVX-LABEL: splatvar_shift_v2i64:
428; AVX:       # BB#0:
429; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
430; AVX-NEXT:    retq
431;
432; XOP-LABEL: splatvar_shift_v2i64:
433; XOP:       # BB#0:
434; XOP-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
435; XOP-NEXT:    retq
436;
437; AVX512-LABEL: splatvar_shift_v2i64:
438; AVX512:       ## BB#0:
439; AVX512-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
440; AVX512-NEXT:    retq
441;
442; X32-SSE-LABEL: splatvar_shift_v2i64:
443; X32-SSE:       # BB#0:
444; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
445; X32-SSE-NEXT:    psllq %xmm1, %xmm0
446; X32-SSE-NEXT:    retl
447  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
448  %shift = shl <2 x i64> %a, %splat
449  ret <2 x i64> %shift
450}
452define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
453; SSE2-LABEL: splatvar_shift_v4i32:
454; SSE2:       # BB#0:
455; SSE2-NEXT:    xorps %xmm2, %xmm2
456; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
457; SSE2-NEXT:    pslld %xmm2, %xmm0
458; SSE2-NEXT:    retq
459;
460; SSE41-LABEL: splatvar_shift_v4i32:
461; SSE41:       # BB#0:
462; SSE41-NEXT:    pxor %xmm2, %xmm2
463; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
464; SSE41-NEXT:    pslld %xmm2, %xmm0
465; SSE41-NEXT:    retq
466;
467; AVX-LABEL: splatvar_shift_v4i32:
468; AVX:       # BB#0:
469; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
470; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
471; AVX-NEXT:    vpslld %xmm1, %xmm0, %xmm0
472; AVX-NEXT:    retq
473;
474; XOP-LABEL: splatvar_shift_v4i32:
475; XOP:       # BB#0:
476; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
477; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
478; XOP-NEXT:    vpslld %xmm1, %xmm0, %xmm0
479; XOP-NEXT:    retq
480;
481; AVX512-LABEL: splatvar_shift_v4i32:
482; AVX512:       ## BB#0:
483; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
484; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
485; AVX512-NEXT:    vpslld %xmm1, %xmm0, %xmm0
486; AVX512-NEXT:    retq
487;
488; X32-SSE-LABEL: splatvar_shift_v4i32:
489; X32-SSE:       # BB#0:
490; X32-SSE-NEXT:    xorps %xmm2, %xmm2
491; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
492; X32-SSE-NEXT:    pslld %xmm2, %xmm0
493; X32-SSE-NEXT:    retl
494  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
495  %shift = shl <4 x i32> %a, %splat
496  ret <4 x i32> %shift
497}
499define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
500; SSE2-LABEL: splatvar_shift_v8i16:
501; SSE2:       # BB#0:
502; SSE2-NEXT:    movd %xmm1, %eax
503; SSE2-NEXT:    movzwl %ax, %eax
504; SSE2-NEXT:    movd %eax, %xmm1
505; SSE2-NEXT:    psllw %xmm1, %xmm0
506; SSE2-NEXT:    retq
507;
508; SSE41-LABEL: splatvar_shift_v8i16:
509; SSE41:       # BB#0:
510; SSE41-NEXT:    pxor %xmm2, %xmm2
511; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
512; SSE41-NEXT:    psllw %xmm2, %xmm0
513; SSE41-NEXT:    retq
514;
515; AVX-LABEL: splatvar_shift_v8i16:
516; AVX:       # BB#0:
517; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
518; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
519; AVX-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
520; AVX-NEXT:    retq
521;
522; XOP-LABEL: splatvar_shift_v8i16:
523; XOP:       # BB#0:
524; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
525; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
526; XOP-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
527; XOP-NEXT:    retq
528;
529; AVX512-LABEL: splatvar_shift_v8i16:
530; AVX512:       ## BB#0:
531; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
532; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
533; AVX512-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
534; AVX512-NEXT:    retq
535;
536; X32-SSE-LABEL: splatvar_shift_v8i16:
537; X32-SSE:       # BB#0:
538; X32-SSE-NEXT:    movd %xmm1, %eax
539; X32-SSE-NEXT:    movzwl %ax, %eax
540; X32-SSE-NEXT:    movd %eax, %xmm1
541; X32-SSE-NEXT:    psllw %xmm1, %xmm0
542; X32-SSE-NEXT:    retl
543  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
544  %shift = shl <8 x i16> %a, %splat
545  ret <8 x i16> %shift
546}
548define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
549; SSE2-LABEL: splatvar_shift_v16i8:
550; SSE2:       # BB#0:
551; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
552; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
553; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
554; SSE2-NEXT:    psllw $5, %xmm2
555; SSE2-NEXT:    pxor %xmm1, %xmm1
556; SSE2-NEXT:    pxor %xmm3, %xmm3
557; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
558; SSE2-NEXT:    movdqa %xmm3, %xmm4
559; SSE2-NEXT:    pandn %xmm0, %xmm4
560; SSE2-NEXT:    psllw $4, %xmm0
561; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
562; SSE2-NEXT:    pand %xmm3, %xmm0
563; SSE2-NEXT:    por %xmm4, %xmm0
564; SSE2-NEXT:    paddb %xmm2, %xmm2
565; SSE2-NEXT:    pxor %xmm3, %xmm3
566; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
567; SSE2-NEXT:    movdqa %xmm3, %xmm4
568; SSE2-NEXT:    pandn %xmm0, %xmm4
569; SSE2-NEXT:    psllw $2, %xmm0
570; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
571; SSE2-NEXT:    pand %xmm3, %xmm0
572; SSE2-NEXT:    por %xmm4, %xmm0
573; SSE2-NEXT:    paddb %xmm2, %xmm2
574; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
575; SSE2-NEXT:    movdqa %xmm1, %xmm2
576; SSE2-NEXT:    pandn %xmm0, %xmm2
577; SSE2-NEXT:    paddb %xmm0, %xmm0
578; SSE2-NEXT:    pand %xmm1, %xmm0
579; SSE2-NEXT:    por %xmm2, %xmm0
580; SSE2-NEXT:    retq
581;
582; SSE41-LABEL: splatvar_shift_v16i8:
583; SSE41:       # BB#0:
584; SSE41-NEXT:    movdqa %xmm0, %xmm2
585; SSE41-NEXT:    pxor %xmm0, %xmm0
586; SSE41-NEXT:    pshufb %xmm0, %xmm1
587; SSE41-NEXT:    psllw $5, %xmm1
588; SSE41-NEXT:    movdqa %xmm1, %xmm3
589; SSE41-NEXT:    paddb %xmm3, %xmm3
590; SSE41-NEXT:    movdqa %xmm2, %xmm4
591; SSE41-NEXT:    psllw $4, %xmm4
592; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
593; SSE41-NEXT:    movdqa %xmm1, %xmm0
594; SSE41-NEXT:    pblendvb %xmm4, %xmm2
595; SSE41-NEXT:    movdqa %xmm2, %xmm1
596; SSE41-NEXT:    psllw $2, %xmm1
597; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
598; SSE41-NEXT:    movdqa %xmm3, %xmm0
599; SSE41-NEXT:    pblendvb %xmm1, %xmm2
600; SSE41-NEXT:    movdqa %xmm2, %xmm1
601; SSE41-NEXT:    paddb %xmm1, %xmm1
602; SSE41-NEXT:    paddb %xmm3, %xmm3
603; SSE41-NEXT:    movdqa %xmm3, %xmm0
604; SSE41-NEXT:    pblendvb %xmm1, %xmm2
605; SSE41-NEXT:    movdqa %xmm2, %xmm0
606; SSE41-NEXT:    retq
607;
608; AVX1-LABEL: splatvar_shift_v16i8:
609; AVX1:       # BB#0:
610; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
611; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
612; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
613; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm2
614; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm3
615; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
616; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
617; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm1
618; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
619; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
620; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
621; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
622; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
623; AVX1-NEXT:    retq
624;
625; AVX2-LABEL: splatvar_shift_v16i8:
626; AVX2:       # BB#0:
627; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
628; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
629; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm2
630; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
631; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
632; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm2
633; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
634; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
635; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
636; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
637; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
638; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
639; AVX2-NEXT:    retq
640;
641; XOPAVX1-LABEL: splatvar_shift_v16i8:
642; XOPAVX1:       # BB#0:
643; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
644; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
645; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
646; XOPAVX1-NEXT:    retq
647;
648; XOPAVX2-LABEL: splatvar_shift_v16i8:
649; XOPAVX2:       # BB#0:
650; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
651; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
652; XOPAVX2-NEXT:    retq
653;
654; AVX512-LABEL: splatvar_shift_v16i8:
655; AVX512:       ## BB#0:
656; AVX512-NEXT:    vpbroadcastb %xmm1, %xmm1
657; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
658; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm2
659; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
660; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
661; AVX512-NEXT:    vpsllw $2, %xmm0, %xmm2
662; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
663; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
664; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
665; AVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
666; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
667; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
668; AVX512-NEXT:    retq
669;
670; X32-SSE-LABEL: splatvar_shift_v16i8:
671; X32-SSE:       # BB#0:
672; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
673; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
674; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
675; X32-SSE-NEXT:    psllw $5, %xmm2
676; X32-SSE-NEXT:    pxor %xmm1, %xmm1
677; X32-SSE-NEXT:    pxor %xmm3, %xmm3
678; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
679; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
680; X32-SSE-NEXT:    pandn %xmm0, %xmm4
681; X32-SSE-NEXT:    psllw $4, %xmm0
682; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
683; X32-SSE-NEXT:    pand %xmm3, %xmm0
684; X32-SSE-NEXT:    por %xmm4, %xmm0
685; X32-SSE-NEXT:    paddb %xmm2, %xmm2
686; X32-SSE-NEXT:    pxor %xmm3, %xmm3
687; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
688; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
689; X32-SSE-NEXT:    pandn %xmm0, %xmm4
690; X32-SSE-NEXT:    psllw $2, %xmm0
691; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
692; X32-SSE-NEXT:    pand %xmm3, %xmm0
693; X32-SSE-NEXT:    por %xmm4, %xmm0
694; X32-SSE-NEXT:    paddb %xmm2, %xmm2
695; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
696; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
697; X32-SSE-NEXT:    pandn %xmm0, %xmm2
698; X32-SSE-NEXT:    paddb %xmm0, %xmm0
699; X32-SSE-NEXT:    pand %xmm1, %xmm0
700; X32-SSE-NEXT:    por %xmm2, %xmm0
701; X32-SSE-NEXT:    retl
702  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
703  %shift = shl <16 x i8> %a, %splat
704  ret <16 x i8> %shift
705}

;
; Constant Shifts
;

711define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
712; SSE2-LABEL: constant_shift_v2i64:
713; SSE2:       # BB#0:
714; SSE2-NEXT:    movdqa %xmm0, %xmm1
715; SSE2-NEXT:    psllq $7, %xmm1
716; SSE2-NEXT:    psllq $1, %xmm0
717; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
718; SSE2-NEXT:    movapd %xmm1, %xmm0
719; SSE2-NEXT:    retq
720;
721; SSE41-LABEL: constant_shift_v2i64:
722; SSE41:       # BB#0:
723; SSE41-NEXT:    movdqa %xmm0, %xmm1
724; SSE41-NEXT:    psllq $7, %xmm1
725; SSE41-NEXT:    psllq $1, %xmm0
726; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
727; SSE41-NEXT:    retq
728;
729; AVX1-LABEL: constant_shift_v2i64:
730; AVX1:       # BB#0:
731; AVX1-NEXT:    vpsllq $7, %xmm0, %xmm1
732; AVX1-NEXT:    vpsllq $1, %xmm0, %xmm0
733; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
734; AVX1-NEXT:    retq
735;
736; AVX2-LABEL: constant_shift_v2i64:
737; AVX2:       # BB#0:
738; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
739; AVX2-NEXT:    retq
740;
741; XOPAVX1-LABEL: constant_shift_v2i64:
742; XOPAVX1:       # BB#0:
743; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
744; XOPAVX1-NEXT:    retq
745;
746; XOPAVX2-LABEL: constant_shift_v2i64:
747; XOPAVX2:       # BB#0:
748; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
749; XOPAVX2-NEXT:    retq
750;
751; AVX512-LABEL: constant_shift_v2i64:
752; AVX512:       ## BB#0:
753; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm0
754; AVX512-NEXT:    retq
755;
756; X32-SSE-LABEL: constant_shift_v2i64:
757; X32-SSE:       # BB#0:
758; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
759; X32-SSE-NEXT:    psllq $7, %xmm1
760; X32-SSE-NEXT:    psllq $1, %xmm0
761; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
762; X32-SSE-NEXT:    movapd %xmm1, %xmm0
763; X32-SSE-NEXT:    retl
764  %shift = shl <2 x i64> %a, <i64 1, i64 7>
765  ret <2 x i64> %shift
766}
768define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
769; SSE2-LABEL: constant_shift_v4i32:
770; SSE2:       # BB#0:
771; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
772; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
773; SSE2-NEXT:    pmuludq %xmm1, %xmm0
774; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
775; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
776; SSE2-NEXT:    pmuludq %xmm2, %xmm1
777; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
778; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
779; SSE2-NEXT:    retq
780;
781; SSE41-LABEL: constant_shift_v4i32:
782; SSE41:       # BB#0:
783; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
784; SSE41-NEXT:    retq
785;
786; AVX1-LABEL: constant_shift_v4i32:
787; AVX1:       # BB#0:
788; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
789; AVX1-NEXT:    retq
790;
791; AVX2-LABEL: constant_shift_v4i32:
792; AVX2:       # BB#0:
793; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
794; AVX2-NEXT:    retq
795;
796; XOPAVX1-LABEL: constant_shift_v4i32:
797; XOPAVX1:       # BB#0:
798; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
799; XOPAVX1-NEXT:    retq
800;
801; XOPAVX2-LABEL: constant_shift_v4i32:
802; XOPAVX2:       # BB#0:
803; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
804; XOPAVX2-NEXT:    retq
805;
806; AVX512-LABEL: constant_shift_v4i32:
807; AVX512:       ## BB#0:
808; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
809; AVX512-NEXT:    retq
810;
811; X32-SSE-LABEL: constant_shift_v4i32:
812; X32-SSE:       # BB#0:
813; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
814; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
815; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
816; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
817; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
818; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
819; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
820; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
821; X32-SSE-NEXT:    retl
822  %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
823  ret <4 x i32> %shift
824}
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; Non-uniform constant shl of v8i16 by <0,1,2,3,4,5,6,7>. Per the checks
; below, this lowers to a multiply by a power-of-two constant pool vector
; (pmullw / vpmullw) on SSE/AVX, a single per-element shift (vpshlw) on
; XOP, and a true variable shift (vpsllvw, widened to zmm with kill
; markers) when AVX512BW is available.
; SSE-LABEL: constant_shift_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: constant_shift_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v8i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}
857
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; Non-uniform constant shl of v16i8. x86 has no byte-granularity shift
; instruction, so (per the checks) SSE2 materializes the shift amounts in
; the top bits (psllw $5) and builds the result bit-serially, selecting
; between shifted/unshifted values with pcmpgtb/pand/pandn/por for the
; 4-, 2- and 1-bit steps; SSE4.1/AVX do the same selection with
; pblendvb / vpblendvb; XOP collapses the whole thing to one vpshlb.
; The word-sized psllw steps need a pand mask to clear bits that crossed
; byte boundaries.
; SSE2-LABEL: constant_shift_v16i8:
; SSE2:       # BB#0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE2-NEXT:    psllw $5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm0, %xmm4
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41:       # BB#0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; SSE41-NEXT:    psllw $5, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psllw $4, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psllw $2, %xmm2
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    paddb %xmm2, %xmm2
; SSE41-NEXT:    paddb %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsllw $2, %xmm0, %xmm2
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v16i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpsllw $2, %xmm0, %xmm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; X32-SSE-NEXT:    psllw $5, %xmm2
; X32-SSE-NEXT:    pxor %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psllw $4, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
; X32-SSE-NEXT:    pandn %xmm0, %xmm4
; X32-SSE-NEXT:    psllw $2, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm4, %xmm0
; X32-SSE-NEXT:    paddb %xmm2, %xmm2
; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    paddb %xmm0, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}
981
982;
983; Uniform Constant Shifts
984;
985
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; Uniform (splat) constant shl of v2i64 by 7 — folds to a single
; immediate-count shift (psllq / vpsllq) on every target, including the
; 32-bit run.
; SSE-LABEL: splatconstant_shift_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    psllq $7, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllq $7, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vpsllq $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsllq $7, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllq $7, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}
1014
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; Uniform (splat) constant shl of v4i32 by 5 — folds to a single
; immediate-count shift (pslld / vpslld) on every target.
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    pslld $5, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpslld $5, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vpslld $5, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpslld $5, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    pslld $5, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}
1043
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; Uniform (splat) constant shl of v8i16 by 3 — folds to a single
; immediate-count shift (psllw / vpsllw) on every target.
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    psllw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vpsllw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $3, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}
1072
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; Uniform (splat) constant shl of v16i8 by 3. There is no byte-granularity
; shift instruction, so (per the checks) it is done as a word shift
; (psllw $3) followed by a constant-pool pand that clears the low bits
; shifted in across byte boundaries; XOP has a real byte shift (vpshlb)
; and needs no mask.
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    psllw $3, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    psllw $3, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    retl
  %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}
1105