; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Shifts
;

define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    psrlq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; SSE2-NEXT:    psrlq %xmm4, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlq %xmm1, %xmm3
; SSE2-NEXT:    psrlq %xmm4, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; SSE2-NEXT:    xorpd %xmm2, %xmm0
; SSE2-NEXT:    psubq %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrlq %xmm1, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; SSE41-NEXT:    psrlq %xmm4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrlq %xmm1, %xmm3
; SSE41-NEXT:    psrlq %xmm4, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    psubq %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm3
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_shift_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: var_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsravq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
; X32-SSE-NEXT:    psrlq %xmm4, %xmm2
; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
; X32-SSE-NEXT:    psrlq %xmm1, %xmm3
; X32-SSE-NEXT:    psrlq %xmm4, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
; X32-SSE-NEXT:    xorpd %xmm2, %xmm0
; X32-SSE-NEXT:    psubq %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <2 x i64> %a, %b
  ret <2 x i64> %shift
}
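
; Note on the lowering checked above (a hand-written sketch of the identity,
; not autogenerated): pre-AVX512 x86 has no variable arithmetic shift for
; 64-bit lanes, so the ashr is synthesized from the logical shift via
;   ashr(x, c) == (lshr(x, c) ^ m) - m,  where m = lshr(0x8000000000000000, c)
; The [9223372036854775808,9223372036854775808] constant is that sign-bit
; mask, and the pshufd plus movsd/pblendw pairs handle the two per-lane
; counts, since psrlq applies a single count per instruction.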

define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrad %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrad %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrad %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrad %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrad %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
; X32-SSE-NEXT:    psrad %xmm2, %xmm3
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrad %xmm4, %xmm2
; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
; X32-SSE-NEXT:    psrad %xmm3, %xmm4
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; X32-SSE-NEXT:    psrad %xmm1, %xmm0
; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X32-SSE-NEXT:    movaps %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <4 x i32> %a, %b
  ret <4 x i32> %shift
}
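
; Note on var_shift_v4i32 (hand-written): psrad takes a single count, so the
; SSE paths extract each lane's count (pshuflw/pshufd; the AVX1 form uses
; psrldq/psrlq/punpckhdq/pmovzxdq instead), perform four scalar-count psrad
; shifts, and reassemble the lanes with punpck*/shufps or pblendw. AVX2, XOP
; and AVX512 get the per-lane shift directly (vpsravd/vpshad).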

define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $12, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psllw $12, %xmm0
; SSE41-NEXT:    psllw $4, %xmm1
; SSE41-NEXT:    por %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    paddw %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psraw $8, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psraw $4, %xmm1
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psraw $2, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psraw $1, %xmm1
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v8i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v8i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psllw $12, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psraw $8, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psraw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psraw $15, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psraw $2, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    psraw $15, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    pandn %xmm0, %xmm2
; X32-SSE-NEXT:    psraw $1, %xmm0
; X32-SSE-NEXT:    pand %xmm1, %xmm0
; X32-SSE-NEXT:    por %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <8 x i16> %a, %b
  ret <8 x i16> %shift
}
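
; Note on var_shift_v8i16 (hand-written): with no variable psraw, the count
; bits are moved to the top of each word (psllw $12, plus psllw $4 + por in
; the SSE41/AVX1 forms) and the shift is built bit-serially: each stage
; selects between x and x >>s 8, 4, 2, 1 using the current top bit (psraw $15
; masking in SSE2, pblendvb in SSE41/AVX1), with paddw exposing the next
; count bit.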

define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    psllw $5, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm2, %xmm6
; SSE2-NEXT:    psraw $4, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm2, %xmm6
; SSE2-NEXT:    psraw $2, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm4
; SSE2-NEXT:    pandn %xmm2, %xmm4
; SSE2-NEXT:    psraw $1, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $4, %xmm4
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $2, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $1, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $4, %xmm2
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $2, %xmm2
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $1, %xmm2
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shift_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: var_shift_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: var_shift_v16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X32-SSE-NEXT:    psllw $5, %xmm1
; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
; X32-SSE-NEXT:    pxor %xmm3, %xmm3
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
; X32-SSE-NEXT:    pandn %xmm2, %xmm6
; X32-SSE-NEXT:    psraw $4, %xmm2
; X32-SSE-NEXT:    pand %xmm5, %xmm2
; X32-SSE-NEXT:    por %xmm6, %xmm2
; X32-SSE-NEXT:    paddw %xmm4, %xmm4
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
; X32-SSE-NEXT:    pandn %xmm2, %xmm6
; X32-SSE-NEXT:    psraw $2, %xmm2
; X32-SSE-NEXT:    pand %xmm5, %xmm2
; X32-SSE-NEXT:    por %xmm6, %xmm2
; X32-SSE-NEXT:    paddw %xmm4, %xmm4
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
; X32-SSE-NEXT:    pandn %xmm2, %xmm4
; X32-SSE-NEXT:    psraw $1, %xmm2
; X32-SSE-NEXT:    pand %xmm5, %xmm2
; X32-SSE-NEXT:    por %xmm4, %xmm2
; X32-SSE-NEXT:    psrlw $8, %xmm2
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    pxor %xmm4, %xmm4
; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
; X32-SSE-NEXT:    pandn %xmm0, %xmm5
; X32-SSE-NEXT:    psraw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm4, %xmm0
; X32-SSE-NEXT:    por %xmm5, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    pxor %xmm4, %xmm4
; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
; X32-SSE-NEXT:    pandn %xmm0, %xmm5
; X32-SSE-NEXT:    psraw $2, %xmm0
; X32-SSE-NEXT:    pand %xmm4, %xmm0
; X32-SSE-NEXT:    por %xmm5, %xmm0
; X32-SSE-NEXT:    paddw %xmm1, %xmm1
; X32-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
; X32-SSE-NEXT:    pandn %xmm0, %xmm1
; X32-SSE-NEXT:    psraw $1, %xmm0
; X32-SSE-NEXT:    pand %xmm3, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    psrlw $8, %xmm0
; X32-SSE-NEXT:    packuswb %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <16 x i8> %a, %b
  ret <16 x i8> %shift
}
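
; Note on var_shift_v16i8 (hand-written): x86 has no byte-granular shifts, so
; the bytes are unpacked into the high halves of words, psllw $5 positions the
; three meaningful count bits for a bit-serial word-ashr sequence (4, 2, 1) on
; each half, and psrlw $8 + packuswb repack the sign-correct high bytes.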

;
; Uniform Variable Shifts
;

define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE-NEXT:    psrlq %xmm1, %xmm2
; SSE-NEXT:    psrlq %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    psubq %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X32-SSE-NEXT:    psrlq %xmm1, %xmm2
; X32-SSE-NEXT:    psrlq %xmm1, %xmm0
; X32-SSE-NEXT:    pxor %xmm2, %xmm0
; X32-SSE-NEXT:    psubq %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %shift = ashr <2 x i64> %a, %splat
  ret <2 x i64> %shift
}
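
; Note on splatvar_shift_v2i64 (hand-written): psrlq's register form shifts
; both lanes by the low 64 bits of the count operand, so a splatted count
; needs no per-lane handling: one psrlq for the sign-bit mask, one for the
; value, then the same xor/psubq sign fix. The splat itself is folded away on
; the SSE/AVX paths.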

define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    psrad %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    psrad %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOP-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    xorps %xmm2, %xmm2
; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT:    psrad %xmm2, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shift = ashr <4 x i32> %a, %splat
  ret <4 x i32> %shift
}
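
; Note on splatvar_shift_v4i32 (hand-written): psrad likewise reads its count
; from the low 64 bits of the operand, so the lowering zero-extends the bottom
; dword of the count vector (movss from a zeroed register in SSE2, pmovzxdq
; in SSE41 and later) and issues a single psrad.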

define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pextrw $0, %xmm1, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    psraw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT:    psraw %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_shift_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatvar_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
; X32-SSE-NEXT:    movd %eax, %xmm1
; X32-SSE-NEXT:    psraw %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %shift = ashr <8 x i16> %a, %splat
  ret <8 x i16> %shift
}
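
; Note on splatvar_shift_v8i16 (hand-written): only the bottom count element
; matters, so SSE2 extracts it via pextrw + movd while SSE41 and later
; zero-extend it with pmovzxwq, followed by a single psraw.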

define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    psllw $5, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm1, %xmm6
; SSE2-NEXT:    psraw $4, %xmm1
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm1, %xmm6
; SSE2-NEXT:    psraw $2, %xmm1
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm4
; SSE2-NEXT:    pandn %xmm1, %xmm4
; SSE2-NEXT:    psraw $1, %xmm1
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $4, %xmm4
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $2, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $1, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $4, %xmm2
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $2, %xmm2
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $1, %xmm2
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm3
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX2-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX2-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX2-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX2-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX2-NEXT:    vpsraw $4, %xmm0, %xmm3
; AVX2-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm3
; AVX2-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $1, %xmm0, %xmm3
; AVX2-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: splatvar_shift_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: splatvar_shift_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatvar_shift_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,0,0,0]
; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X32-SSE-NEXT:    psllw $5, %xmm3
; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; X32-SSE-NEXT:    pxor %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
; X32-SSE-NEXT:    pandn %xmm1, %xmm6
; X32-SSE-NEXT:    psraw $4, %xmm1
; X32-SSE-NEXT:    pand %xmm5, %xmm1
; X32-SSE-NEXT:    por %xmm6, %xmm1
; X32-SSE-NEXT:    paddw %xmm4, %xmm4
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
; X32-SSE-NEXT:    pandn %xmm1, %xmm6
; X32-SSE-NEXT:    psraw $2, %xmm1
; X32-SSE-NEXT:    pand %xmm5, %xmm1
; X32-SSE-NEXT:    por %xmm6, %xmm1
; X32-SSE-NEXT:    paddw %xmm4, %xmm4
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
; X32-SSE-NEXT:    pandn %xmm1, %xmm4
; X32-SSE-NEXT:    psraw $1, %xmm1
; X32-SSE-NEXT:    pand %xmm5, %xmm1
; X32-SSE-NEXT:    por %xmm4, %xmm1
; X32-SSE-NEXT:    psrlw $8, %xmm1
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    pxor %xmm4, %xmm4
; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm4
; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
; X32-SSE-NEXT:    pandn %xmm0, %xmm5
; X32-SSE-NEXT:    psraw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm4, %xmm0
; X32-SSE-NEXT:    por %xmm5, %xmm0
; X32-SSE-NEXT:    paddw %xmm3, %xmm3
; X32-SSE-NEXT:    pxor %xmm4, %xmm4
; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm4
; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
; X32-SSE-NEXT:    pandn %xmm0, %xmm5
; X32-SSE-NEXT:    psraw $2, %xmm0
; X32-SSE-NEXT:    pand %xmm4, %xmm0
; X32-SSE-NEXT:    por %xmm5, %xmm0
; X32-SSE-NEXT:    paddw %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psraw $1, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    psrlw $8, %xmm0
; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %shift = ashr <16 x i8> %a, %splat
  ret <16 x i8> %shift
}
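
; Note on splatvar_shift_v16i8 (hand-written): a splatted count does not help
; for bytes, since there is still no byte shift; the count byte is broadcast
; (punpcklbw/pshuflw/pshufd in SSE2, pshufb or vpbroadcastb later) and the
; generic unpack-to-words bit-serial sequence from var_shift_v16i8 runs.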

;
; Constant Shifts
;

define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $1, %xmm1
; SSE2-NEXT:    psrlq $7, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
; SSE2-NEXT:    xorpd %xmm1, %xmm0
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlq $7, %xmm1
; SSE41-NEXT:    psrlq $1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    psubq %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: constant_shift_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,7]
; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: constant_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsravq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
; X32-SSE-NEXT:    psrlq $1, %xmm2
; X32-SSE-NEXT:    psrlq $7, %xmm1
; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrlq $1, %xmm2
; X32-SSE-NEXT:    psrlq $7, %xmm0
; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT:    xorpd %xmm1, %xmm0
; X32-SSE-NEXT:    psubq %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <2 x i64> %a, <i64 1, i64 7>
  ret <2 x i64> %shift
}
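
; Note on constant_shift_v2i64 (hand-written arithmetic check): with counts
; <1,7> known at compile time, the logical shifts become immediate
; psrlq $1 / psrlq $7 on the two lanes, and on the 64-bit paths the sign-fix
; mask folds to the constant
;   [0x8000000000000000 >> 1, 0x8000000000000000 >> 7]
;     = [4611686018427387904,72057594037927936]
; before the usual xor/psubq correction.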

define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $7, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $6, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $5, %xmm1
; SSE2-NEXT:    psrad $4, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $7, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $5, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $6, %xmm1
; SSE41-NEXT:    psrad $4, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrad $6, %xmm0, %xmm2
; AVX1-NEXT:    vpsrad $4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshad {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: constant_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrad $7, %xmm1
; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
; X32-SSE-NEXT:    psrad $6, %xmm2
; X32-SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrad $5, %xmm1
; X32-SSE-NEXT:    psrad $4, %xmm0
; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; X32-SSE-NEXT:    retl
  %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %shift
}
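
; Note on constant_shift_v4i32 (hand-written): counts <4,5,6,7> lower to four
; immediate psrad shifts blended back together on SSE/AVX1, while AVX2, XOP
; and AVX512 fold the whole constant count vector into a memory operand of
; vpsravd/vpshad.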

define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psraw $4, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm1, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; SSE2-NEXT:    psraw $2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    andps %xmm1, %xmm0
; SSE2-NEXT:    psraw $1, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm1
; SSE2-NEXT:    orps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psraw $4, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    psraw $1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: constant_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v8i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v8i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v8i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psraw $4, %xmm1
; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X32-SSE-NEXT:    movapd %xmm1, %xmm2
; X32-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; X32-SSE-NEXT:    psraw $2, %xmm1
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; X32-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X32-SSE-NEXT:    movaps {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; X32-SSE-NEXT:    movaps %xmm2, %xmm0
; X32-SSE-NEXT:    andps %xmm1, %xmm0
; X32-SSE-NEXT:    psraw $1, %xmm2
; X32-SSE-NEXT:    andnps %xmm2, %xmm1
; X32-SSE-NEXT:    orps %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  ret <8 x i16> %shift
}
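; NOTE: With no variable psraw available below AVX512BW, the i16 amounts are
; handled by binary decomposition: psraw by 4, 2 and 1 in turn, blending the
; shifted value in only for the lanes whose amount has that bit set. The
; per-lane identity the ladder relies on, with amount c = 4*c2 + 2*c1 + c0:
;
;   x = c2 ? (x >>s 4) : x
;   x = c1 ? (x >>s 2) : x
;   x = c0 ? (x >>s 1) : x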

define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,49376,32928,16480,32]
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm1, %xmm6
; SSE2-NEXT:    psraw $4, %xmm1
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm1, %xmm6
; SSE2-NEXT:    psraw $2, %xmm1
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm4
; SSE2-NEXT:    pandn %xmm1, %xmm4
; SSE2-NEXT:    psraw $1, %xmm1
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    por %xmm4, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,49376,32928,16480,32]
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psraw $4, %xmm4
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psraw $2, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psraw $1, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psraw $4, %xmm3
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psraw $2, %xmm3
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psraw $1, %xmm3
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    packuswb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: constant_shift_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [8192,24640,41088,57536,49376,32928,16480,32]
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,49376,32928,16480,32]
; X32-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; X32-SSE-NEXT:    pxor %xmm2, %xmm2
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
; X32-SSE-NEXT:    pandn %xmm1, %xmm6
; X32-SSE-NEXT:    psraw $4, %xmm1
; X32-SSE-NEXT:    pand %xmm5, %xmm1
; X32-SSE-NEXT:    por %xmm6, %xmm1
; X32-SSE-NEXT:    paddw %xmm4, %xmm4
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
; X32-SSE-NEXT:    pandn %xmm1, %xmm6
; X32-SSE-NEXT:    psraw $2, %xmm1
; X32-SSE-NEXT:    pand %xmm5, %xmm1
; X32-SSE-NEXT:    por %xmm6, %xmm1
; X32-SSE-NEXT:    paddw %xmm4, %xmm4
; X32-SSE-NEXT:    pxor %xmm5, %xmm5
; X32-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X32-SSE-NEXT:    movdqa %xmm5, %xmm4
; X32-SSE-NEXT:    pandn %xmm1, %xmm4
; X32-SSE-NEXT:    psraw $1, %xmm1
; X32-SSE-NEXT:    pand %xmm5, %xmm1
; X32-SSE-NEXT:    por %xmm4, %xmm1
; X32-SSE-NEXT:    psrlw $8, %xmm1
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT:    pxor %xmm4, %xmm4
; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm4
; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
; X32-SSE-NEXT:    pandn %xmm0, %xmm5
; X32-SSE-NEXT:    psraw $4, %xmm0
; X32-SSE-NEXT:    pand %xmm4, %xmm0
; X32-SSE-NEXT:    por %xmm5, %xmm0
; X32-SSE-NEXT:    paddw %xmm3, %xmm3
; X32-SSE-NEXT:    pxor %xmm4, %xmm4
; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm4
; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
; X32-SSE-NEXT:    pandn %xmm0, %xmm5
; X32-SSE-NEXT:    psraw $2, %xmm0
; X32-SSE-NEXT:    pand %xmm4, %xmm0
; X32-SSE-NEXT:    por %xmm5, %xmm0
; X32-SSE-NEXT:    paddw %xmm3, %xmm3
; X32-SSE-NEXT:    pcmpgtw %xmm3, %xmm2
; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
; X32-SSE-NEXT:    pandn %xmm0, %xmm3
; X32-SSE-NEXT:    psraw $1, %xmm0
; X32-SSE-NEXT:    pand %xmm2, %xmm0
; X32-SSE-NEXT:    por %xmm3, %xmm0
; X32-SSE-NEXT:    psrlw $8, %xmm0
; X32-SSE-NEXT:    packuswb %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}
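; NOTE: x86 has no per-byte shifts at all, so the bytes are widened into the
; high halves of words (punpckhbw/punpcklbw), run through the same psraw
; 4/2/1 ladder, then narrowed again with psrlw $8 + packuswb. The constant
; [8192,24640,41088,57536,49376,32928,16480,32] is the byte shift amounts
; <0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0> pre-scaled by 32 (amount << 5), parking
; each amount's three bits in its byte's MSBs, where pblendvb (or pcmpgtw
; against zero on SSE2) reads them one bit per round while paddw doubles the
; control vector.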

;
; Uniform Constant Shifts
;

define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $7, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    psrlq $7, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatconstant_shift_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $7, %xmm1
; SSE41-NEXT:    psrlq $7, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: splatconstant_shift_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $7, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpsraq $7, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraq $7, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psrad $7, %xmm1
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; X32-SSE-NEXT:    psrlq $7, %xmm0
; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT:    retl
  %shift = ashr <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}
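; NOTE: An arithmetic quadword shift only exists on AVX-512 (vpsraq); the
; older targets synthesize it from two dword-granular pieces: psrlq $7
; supplies the correct low dword of each i64 lane, psrad $7 on the high
; dwords supplies the sign-extended high dword, and a blend/shuffle stitches
; the halves back together. Per lane:
;
;   lo32(x >>s 7) == lo32(x >>u 7)
;   hi32(x >>s 7) == hi32(x) >>s 7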

define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $5, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrad $5, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psrad $5, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}

define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psraw $3, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}

define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
; X32-SSE:       # %bb.0:
; X32-SSE-NEXT:    psrlw $3, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X32-SSE-NEXT:    pxor %xmm1, %xmm0
; X32-SSE-NEXT:    psubb %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}
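; NOTE: The uniform i8 case uses the xor/sub sign-extension identity: with
; m = 0x80 >> 3 = 16,
;
;   x >>s 3 == ((x >>u 3) ^ m) - m
;
; psrlw $3 + pand implement the logical byte shift (the mask strips the bits
; psrlw drags across byte boundaries), and the pxor/psubb pair against the
; [16,16,...] vector then restores the sign bits.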