• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
6; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
7;
8; Variable Shifts
9;
10
; Per-element arithmetic shift right of <4 x i64> by a variable vector amount.
; Pre-AVX512 x86 has no variable vector sra for i64, so the AVX1/AVX2 lowering
; shifts logically and then fixes up the sign with xor/sub against the
; similarly-shifted sign-bit mask 0x8000000000000000 (9223372036854775808).
; XOPAVX1 instead negates the amounts and uses vpshaq directly.
; NOTE(review): CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py - regenerate them, do not hand-edit.
11define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
12; AVX1-LABEL: var_shift_v4i64:
13; AVX1:       # BB#0:
14; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
15; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
16; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
17; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
18; AVX1-NEXT:    vpsrlq %xmm5, %xmm3, %xmm6
19; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
20; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
21; AVX1-NEXT:    vpsrlq %xmm2, %xmm6, %xmm2
22; AVX1-NEXT:    vpsrlq %xmm5, %xmm6, %xmm5
23; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
24; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
25; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm2
26; AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm4
27; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
28; AVX1-NEXT:    vpsrlq %xmm5, %xmm3, %xmm3
29; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
30; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm1
31; AVX1-NEXT:    vpsrlq %xmm5, %xmm0, %xmm0
32; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
33; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
34; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
35; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
36; AVX1-NEXT:    retq
37;
38; AVX2-LABEL: var_shift_v4i64:
39; AVX2:       # BB#0:
40; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
41; AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm3
42; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
43; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
44; AVX2-NEXT:    vpsubq %ymm3, %ymm0, %ymm0
45; AVX2-NEXT:    retq
46;
47; XOPAVX1-LABEL: var_shift_v4i64:
48; XOPAVX1:       # BB#0:
49; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
50; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
51; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
52; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
53; XOPAVX1-NEXT:    vpshaq %xmm2, %xmm4, %xmm2
54; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
55; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
56; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
57; XOPAVX1-NEXT:    retq
58;
59; XOPAVX2-LABEL: var_shift_v4i64:
60; XOPAVX2:       # BB#0:
61; XOPAVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
62; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm3
63; XOPAVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
64; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
65; XOPAVX2-NEXT:    vpsubq %ymm3, %ymm0, %ymm0
66; XOPAVX2-NEXT:    retq
67;
68; AVX512-LABEL: var_shift_v4i64:
69; AVX512:       ## BB#0:
70; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
71; AVX512-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm3
72; AVX512-NEXT:    vpxor %ymm2, %ymm0, %ymm0
73; AVX512-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
74; AVX512-NEXT:    vpsubq %ymm3, %ymm0, %ymm0
75; AVX512-NEXT:    retq
76  %shift = ashr <4 x i64> %a, %b
77  ret <4 x i64> %shift
78}
79
; Per-element arithmetic shift right of <8 x i32> by a variable vector amount.
; AVX2/XOPAVX2/AVX512 map this directly onto a single vpsravd. AVX1 has no
; variable dword sra, so it isolates each lane's shift amount (vpsrldq /
; vpsrlq $32 / vpunpckhdq / vpmovzxdq) and issues four scalar-amount vpsrad
; per 128-bit half, blending the results back together.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
80define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
81; AVX1-LABEL: var_shift_v8i32:
82; AVX1:       # BB#0:
83; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
84; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
85; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
86; AVX1-NEXT:    vpsrad %xmm4, %xmm2, %xmm4
87; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
88; AVX1-NEXT:    vpsrad %xmm5, %xmm2, %xmm5
89; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
90; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
91; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
92; AVX1-NEXT:    vpsrad %xmm6, %xmm2, %xmm6
93; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
94; AVX1-NEXT:    vpsrad %xmm3, %xmm2, %xmm2
95; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
96; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
97; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
98; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
99; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
100; AVX1-NEXT:    vpsrad %xmm4, %xmm0, %xmm4
101; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
102; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
103; AVX1-NEXT:    vpsrad %xmm4, %xmm0, %xmm4
104; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
105; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
106; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
107; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
108; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
109; AVX1-NEXT:    retq
110;
111; AVX2-LABEL: var_shift_v8i32:
112; AVX2:       # BB#0:
113; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
114; AVX2-NEXT:    retq
115;
116; XOPAVX1-LABEL: var_shift_v8i32:
117; XOPAVX1:       # BB#0:
118; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
119; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
120; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
121; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
122; XOPAVX1-NEXT:    vpshad %xmm2, %xmm4, %xmm2
123; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
124; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
125; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
126; XOPAVX1-NEXT:    retq
127;
128; XOPAVX2-LABEL: var_shift_v8i32:
129; XOPAVX2:       # BB#0:
130; XOPAVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
131; XOPAVX2-NEXT:    retq
132;
133; AVX512-LABEL: var_shift_v8i32:
134; AVX512:       ## BB#0:
135; AVX512-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
136; AVX512-NEXT:    retq
137  %shift = ashr <8 x i32> %a, %b
138  ret <8 x i32> %shift
139}
140
; Per-element arithmetic shift right of <16 x i16> by a variable vector amount.
; AVX1 builds per-lane blend masks (vpsllw $12/$4 + vpor, doubled with vpaddw
; each round) and selects between progressively shifted copies (vpsraw
; $8/$4/$2/$1) via vpblendvb. AVX2 widens to i32 halves, uses vpsravd, and
; repacks with vpackusdw. XOP negates the amounts and uses vpshaw. AVX512BW
; uses the native zmm vpsravw (with kill annotations for the ymm<->zmm views).
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
141define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
142; AVX1-LABEL: var_shift_v16i16:
143; AVX1:       # BB#0:
144; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
145; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
146; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
147; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
148; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
149; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
150; AVX1-NEXT:    vpsraw $8, %xmm4, %xmm5
151; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
152; AVX1-NEXT:    vpsraw $4, %xmm2, %xmm4
153; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
154; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm4
155; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
156; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
157; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm4
158; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
159; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
160; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
161; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
162; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
163; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
164; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm4
165; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
166; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
167; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
168; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
169; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
170; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
171; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
172; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
173; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
174; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
175; AVX1-NEXT:    retq
176;
177; AVX2-LABEL: var_shift_v16i16:
178; AVX2:       # BB#0:
179; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
180; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
181; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
182; AVX2-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
183; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
184; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
185; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
186; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
187; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
188; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
189; AVX2-NEXT:    retq
190;
191; XOPAVX1-LABEL: var_shift_v16i16:
192; XOPAVX1:       # BB#0:
193; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
194; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
195; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
196; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
197; XOPAVX1-NEXT:    vpshaw %xmm2, %xmm4, %xmm2
198; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
199; XOPAVX1-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
200; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
201; XOPAVX1-NEXT:    retq
202;
203; XOPAVX2-LABEL: var_shift_v16i16:
204; XOPAVX2:       # BB#0:
205; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
206; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
207; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
208; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
209; XOPAVX2-NEXT:    vpshaw %xmm2, %xmm4, %xmm2
210; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
211; XOPAVX2-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
212; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
213; XOPAVX2-NEXT:    retq
214;
215; AVX512-LABEL: var_shift_v16i16:
216; AVX512:       ## BB#0:
217; AVX512-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
218; AVX512-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
219; AVX512-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
220; AVX512-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
221; AVX512-NEXT:    retq
222  %shift = ashr <16 x i16> %a, %b
223  ret <16 x i16> %shift
224}
225
; Per-element arithmetic shift right of <32 x i8> by a variable vector amount.
; No x86 target here has a native byte sra, so AVX1/AVX2/AVX512 lower via the
; vpsllw $5 + vpblendvb mask technique on interleaved (vpunpck{l,h}bw)
; word-sized halves: select vpsraw $4/$2/$1 per element, then vpsrlw $8 and
; vpackuswb to recombine bytes. XOP uses vpshab with negated shift amounts.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
226define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
227; AVX1-LABEL: var_shift_v32i8:
228; AVX1:       # BB#0:
229; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
230; AVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
231; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
232; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
233; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
234; AVX1-NEXT:    vpsraw $4, %xmm5, %xmm6
235; AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
236; AVX1-NEXT:    vpsraw $2, %xmm5, %xmm6
237; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
238; AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
239; AVX1-NEXT:    vpsraw $1, %xmm5, %xmm6
240; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
241; AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
242; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
243; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
244; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
245; AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
246; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
247; AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
248; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
249; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
250; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
251; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
252; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
253; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
254; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
255; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
256; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
257; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
258; AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
259; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
260; AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
261; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
262; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
263; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
264; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
265; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
266; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
267; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
268; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
269; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm4
270; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
271; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm4
272; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
273; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
274; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm4
275; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
276; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
277; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
278; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
279; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
280; AVX1-NEXT:    retq
281;
282; AVX2-LABEL: var_shift_v32i8:
283; AVX2:       # BB#0:
284; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
285; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
286; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
287; AVX2-NEXT:    vpsraw $4, %ymm3, %ymm4
288; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
289; AVX2-NEXT:    vpsraw $2, %ymm3, %ymm4
290; AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
291; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
292; AVX2-NEXT:    vpsraw $1, %ymm3, %ymm4
293; AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
294; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
295; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
296; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
297; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
298; AVX2-NEXT:    vpsraw $4, %ymm0, %ymm3
299; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
300; AVX2-NEXT:    vpsraw $2, %ymm0, %ymm3
301; AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
302; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
303; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm3
304; AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
305; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
306; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
307; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
308; AVX2-NEXT:    retq
309;
310; XOPAVX1-LABEL: var_shift_v32i8:
311; XOPAVX1:       # BB#0:
312; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
313; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
314; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
315; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
316; XOPAVX1-NEXT:    vpshab %xmm2, %xmm4, %xmm2
317; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
318; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
319; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
320; XOPAVX1-NEXT:    retq
321;
322; XOPAVX2-LABEL: var_shift_v32i8:
323; XOPAVX2:       # BB#0:
324; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
325; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
326; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
327; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
328; XOPAVX2-NEXT:    vpshab %xmm2, %xmm4, %xmm2
329; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
330; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
331; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
332; XOPAVX2-NEXT:    retq
333;
334; AVX512-LABEL: var_shift_v32i8:
335; AVX512:       ## BB#0:
336; AVX512-NEXT:    vpsllw $5, %ymm1, %ymm1
337; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
338; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
339; AVX512-NEXT:    vpsraw $4, %ymm3, %ymm4
340; AVX512-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
341; AVX512-NEXT:    vpsraw $2, %ymm3, %ymm4
342; AVX512-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
343; AVX512-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
344; AVX512-NEXT:    vpsraw $1, %ymm3, %ymm4
345; AVX512-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
346; AVX512-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
347; AVX512-NEXT:    vpsrlw $8, %ymm2, %ymm2
348; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
349; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
350; AVX512-NEXT:    vpsraw $4, %ymm0, %ymm3
351; AVX512-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
352; AVX512-NEXT:    vpsraw $2, %ymm0, %ymm3
353; AVX512-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
354; AVX512-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
355; AVX512-NEXT:    vpsraw $1, %ymm0, %ymm3
356; AVX512-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
357; AVX512-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
358; AVX512-NEXT:    vpsrlw $8, %ymm0, %ymm0
359; AVX512-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
360; AVX512-NEXT:    retq
361  %shift = ashr <32 x i8> %a, %b
362  ret <32 x i8> %shift
363}
364
365;
366; Uniform Variable Shifts
367;
368
; Arithmetic shift right of <4 x i64> where every lane uses the same amount
; (element 0 of %b, splatted). All lanes share one scalar count, so the
; lowering can use the scalar-count vpsrlq form plus the sign-bit
; (0x8000000000000000) xor/sub fix-up; XOPAVX1 splats with vmovddup and uses
; vpshaq on a negated amount.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
369define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
370; AVX1-LABEL: splatvar_shift_v4i64:
371; AVX1:       # BB#0:
372; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
373; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
374; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
375; AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm3
376; AVX1-NEXT:    vpxor %xmm2, %xmm3, %xmm3
377; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm3
378; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
379; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
380; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
381; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
382; AVX1-NEXT:    retq
383;
384; AVX2-LABEL: splatvar_shift_v4i64:
385; AVX2:       # BB#0:
386; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
387; AVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
388; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
389; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
390; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
391; AVX2-NEXT:    retq
392;
393; XOPAVX1-LABEL: splatvar_shift_v4i64:
394; XOPAVX1:       # BB#0:
395; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
396; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
397; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
398; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
399; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm2, %xmm2
400; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
401; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
402; XOPAVX1-NEXT:    retq
403;
404; XOPAVX2-LABEL: splatvar_shift_v4i64:
405; XOPAVX2:       # BB#0:
406; XOPAVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
407; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
408; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
409; XOPAVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
410; XOPAVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
411; XOPAVX2-NEXT:    retq
412;
413; AVX512-LABEL: splatvar_shift_v4i64:
414; AVX512:       ## BB#0:
415; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
416; AVX512-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
417; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
418; AVX512-NEXT:    vpxor %ymm2, %ymm0, %ymm0
419; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
420; AVX512-NEXT:    retq
421  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
422  %shift = ashr <4 x i64> %a, %splat
423  ret <4 x i64> %shift
424}
425
; Arithmetic shift right of <8 x i32> by a uniform (splatted) amount. The
; lowering zeroes the upper bits of the count register (vpblendw/vmovss with
; a zeroed register) and uses the scalar-count vpsrad form, once per 128-bit
; half on AVX1/XOPAVX1 and once on the full ymm elsewhere.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
426define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
427; AVX1-LABEL: splatvar_shift_v8i32:
428; AVX1:       # BB#0:
429; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
430; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
431; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
432; AVX1-NEXT:    vpsrad %xmm1, %xmm2, %xmm2
433; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
434; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
435; AVX1-NEXT:    retq
436;
437; AVX2-LABEL: splatvar_shift_v8i32:
438; AVX2:       # BB#0:
439; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
440; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
441; AVX2-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
442; AVX2-NEXT:    retq
443;
444; XOPAVX1-LABEL: splatvar_shift_v8i32:
445; XOPAVX1:       # BB#0:
446; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
447; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
448; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
449; XOPAVX1-NEXT:    vpsrad %xmm1, %xmm2, %xmm2
450; XOPAVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
451; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
452; XOPAVX1-NEXT:    retq
453;
454; XOPAVX2-LABEL: splatvar_shift_v8i32:
455; XOPAVX2:       # BB#0:
456; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
457; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
458; XOPAVX2-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
459; XOPAVX2-NEXT:    retq
460;
461; AVX512-LABEL: splatvar_shift_v8i32:
462; AVX512:       ## BB#0:
463; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
464; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
465; AVX512-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
466; AVX512-NEXT:    retq
467  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
468  %shift = ashr <8 x i32> %a, %splat
469  ret <8 x i32> %shift
470}
471
; Arithmetic shift right of <16 x i16> by a uniform (splatted) amount. The
; word count is moved to a GPR, zero-extended (vmovd + movzwl + vmovd), then
; fed to the scalar-count vpsraw form - per 128-bit half on AVX1/XOPAVX1,
; on the whole ymm register otherwise.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
472define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
473; AVX1-LABEL: splatvar_shift_v16i16:
474; AVX1:       # BB#0:
475; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
476; AVX1-NEXT:    vmovd %xmm1, %eax
477; AVX1-NEXT:    movzwl %ax, %eax
478; AVX1-NEXT:    vmovd %eax, %xmm1
479; AVX1-NEXT:    vpsraw %xmm1, %xmm2, %xmm2
480; AVX1-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
481; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
482; AVX1-NEXT:    retq
483;
484; AVX2-LABEL: splatvar_shift_v16i16:
485; AVX2:       # BB#0:
486; AVX2-NEXT:    vmovd %xmm1, %eax
487; AVX2-NEXT:    movzwl %ax, %eax
488; AVX2-NEXT:    vmovd %eax, %xmm1
489; AVX2-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
490; AVX2-NEXT:    retq
491;
492; XOPAVX1-LABEL: splatvar_shift_v16i16:
493; XOPAVX1:       # BB#0:
494; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
495; XOPAVX1-NEXT:    vmovd %xmm1, %eax
496; XOPAVX1-NEXT:    movzwl %ax, %eax
497; XOPAVX1-NEXT:    vmovd %eax, %xmm1
498; XOPAVX1-NEXT:    vpsraw %xmm1, %xmm2, %xmm2
499; XOPAVX1-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
500; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
501; XOPAVX1-NEXT:    retq
502;
503; XOPAVX2-LABEL: splatvar_shift_v16i16:
504; XOPAVX2:       # BB#0:
505; XOPAVX2-NEXT:    vmovd %xmm1, %eax
506; XOPAVX2-NEXT:    movzwl %ax, %eax
507; XOPAVX2-NEXT:    vmovd %eax, %xmm1
508; XOPAVX2-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
509; XOPAVX2-NEXT:    retq
510;
511; AVX512-LABEL: splatvar_shift_v16i16:
512; AVX512:       ## BB#0:
513; AVX512-NEXT:    vmovd %xmm1, %eax
514; AVX512-NEXT:    movzwl %ax, %eax
515; AVX512-NEXT:    vmovd %eax, %xmm1
516; AVX512-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
517; AVX512-NEXT:    retq
518  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
519  %shift = ashr <16 x i16> %a, %splat
520  ret <16 x i16> %shift
521}
522
; Arithmetic shift right of <32 x i8> by a uniform (splatted) amount. Even
; with one shared count there is no byte sra, so after splatting the byte
; (vpshufb on AVX1, vpbroadcastb on AVX2/AVX512/XOPAVX2) codegen falls back
; to the vpsllw $5 + vpblendvb selection of vpsraw $4/$2/$1 on interleaved
; word halves, recombined via vpsrlw $8 + vpackuswb. XOP negates the splat
; and uses vpshab. Note the AVX1 version reuses the shared blend masks
; (xmm2/xmm6/xmm9, xmm1/xmm4/xmm7) across both halves.
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py.
523define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
524; AVX1-LABEL: splatvar_shift_v32i8:
525; AVX1:       # BB#0:
526; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
527; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
528; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
529; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
530; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
531; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
532; AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
533; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
534; AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
535; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm6
536; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
537; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
538; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm9
539; AVX1-NEXT:    vpblendvb %xmm9, %xmm5, %xmm4, %xmm4
540; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm8
541; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
542; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
543; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm5
544; AVX1-NEXT:    vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
545; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm5
546; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm4
547; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
548; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm5
549; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm7
550; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm3, %xmm3
551; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
552; AVX1-NEXT:    vpackuswb %xmm8, %xmm3, %xmm8
553; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
554; AVX1-NEXT:    vpsraw $4, %xmm5, %xmm3
555; AVX1-NEXT:    vpblendvb %xmm2, %xmm3, %xmm5, %xmm2
556; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm3
557; AVX1-NEXT:    vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
558; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm3
559; AVX1-NEXT:    vpblendvb %xmm9, %xmm3, %xmm2, %xmm2
560; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
561; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
562; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm3
563; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
564; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
565; AVX1-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
566; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
567; AVX1-NEXT:    vpblendvb %xmm7, %xmm1, %xmm0, %xmm0
568; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
569; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
570; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm0
571; AVX1-NEXT:    retq
572;
573; AVX2-LABEL: splatvar_shift_v32i8:
574; AVX2:       # BB#0:
575; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
576; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
577; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
578; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
579; AVX2-NEXT:    vpsraw $4, %ymm3, %ymm4
580; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
581; AVX2-NEXT:    vpsraw $2, %ymm3, %ymm4
582; AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
583; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
584; AVX2-NEXT:    vpsraw $1, %ymm3, %ymm4
585; AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
586; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
587; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
588; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
589; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
590; AVX2-NEXT:    vpsraw $4, %ymm0, %ymm3
591; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
592; AVX2-NEXT:    vpsraw $2, %ymm0, %ymm3
593; AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
594; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
595; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm3
596; AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
597; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
598; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
599; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
600; AVX2-NEXT:    retq
601;
602; XOPAVX1-LABEL: splatvar_shift_v32i8:
603; XOPAVX1:       # BB#0:
604; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
605; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
606; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
607; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
608; XOPAVX1-NEXT:    vpshab %xmm1, %xmm2, %xmm2
609; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
610; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
611; XOPAVX1-NEXT:    retq
612;
613; XOPAVX2-LABEL: splatvar_shift_v32i8:
614; XOPAVX2:       # BB#0:
615; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
616; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
617; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
618; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
619; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
620; XOPAVX2-NEXT:    vpshab %xmm2, %xmm4, %xmm2
621; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
622; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
623; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
624; XOPAVX2-NEXT:    retq
625;
626; AVX512-LABEL: splatvar_shift_v32i8:
627; AVX512:       ## BB#0:
628; AVX512-NEXT:    vpbroadcastb %xmm1, %ymm1
629; AVX512-NEXT:    vpsllw $5, %ymm1, %ymm1
630; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
631; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
632; AVX512-NEXT:    vpsraw $4, %ymm3, %ymm4
633; AVX512-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
634; AVX512-NEXT:    vpsraw $2, %ymm3, %ymm4
635; AVX512-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
636; AVX512-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
637; AVX512-NEXT:    vpsraw $1, %ymm3, %ymm4
638; AVX512-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
639; AVX512-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
640; AVX512-NEXT:    vpsrlw $8, %ymm2, %ymm2
641; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
642; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
643; AVX512-NEXT:    vpsraw $4, %ymm0, %ymm3
644; AVX512-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
645; AVX512-NEXT:    vpsraw $2, %ymm0, %ymm3
646; AVX512-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
647; AVX512-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
648; AVX512-NEXT:    vpsraw $1, %ymm0, %ymm3
649; AVX512-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
650; AVX512-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
651; AVX512-NEXT:    vpsrlw $8, %ymm0, %ymm0
652; AVX512-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
653; AVX512-NEXT:    retq
654  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
655  %shift = ashr <32 x i8> %a, %splat
656  ret <32 x i8> %shift
657}
658
659;
660; Constant Shifts
661;
662
define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4294967296,2]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4611686018427387904,72057594037927936]
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vpshaq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
; XOPAVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v4i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
; IR under test: per-lane arithmetic shift right by the non-uniform
; constants <1, 7, 31, 62>. The CHECK lines above are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them instead of hand-editing.
  %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}
719
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrad $6, %xmm0, %xmm2
; AVX1-NEXT:    vpsrad $4, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrad $9, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpshad {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpshad {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v8i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
; IR under test: per-lane arithmetic shift right by the non-uniform
; constants <4,5,6,7,8,9,8,7>. The CHECK lines above are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them instead of hand-editing.
  %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <8 x i32> %shift
}
764
define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsraw $4, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsraw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX2-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
; AVX2-NEXT:    vpsravd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vpshaw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vpshaw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v16i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT:    retq
; IR under test: per-lane arithmetic shift right by the constants 0..15
; (one distinct amount per i16 lane). The CHECK lines above are autogenerated
; by utils/update_llc_test_checks.py -- regenerate them instead of hand-editing.
  %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <16 x i16> %shift
}
832
define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
; AVX1-NEXT:    vpaddw %xmm6, %xmm6, %xmm9
; AVX1-NEXT:    vpblendvb %xmm9, %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm8
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; AVX1-NEXT:    vpsraw $4, %xmm3, %xmm5
; AVX1-NEXT:    vpblendvb %xmm1, %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $2, %xmm3, %xmm5
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm5
; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm7
; AVX1-NEXT:    vpblendvb %xmm7, %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm8, %xmm3, %xmm8
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $4, %xmm5, %xmm3
; AVX1-NEXT:    vpblendvb %xmm2, %xmm3, %xmm5, %xmm2
; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm3
; AVX1-NEXT:    vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm3
; AVX1-NEXT:    vpblendvb %xmm9, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm7, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT:    vpsraw $4, %ymm3, %ymm4
; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpsraw $2, %ymm3, %ymm4
; AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpsraw $1, %ymm3, %ymm4
; AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT:    vpsraw $4, %ymm0, %ymm3
; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsraw $2, %ymm0, %ymm3
; AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm3
; AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshab %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpshab %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v32i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512-NEXT:    vpsraw $4, %ymm3, %ymm4
; AVX512-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512-NEXT:    vpsraw $2, %ymm3, %ymm4
; AVX512-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; AVX512-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; AVX512-NEXT:    vpsraw $1, %ymm3, %ymm4
; AVX512-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; AVX512-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512-NEXT:    vpsraw $4, %ymm0, %ymm3
; AVX512-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512-NEXT:    vpsraw $2, %ymm0, %ymm3
; AVX512-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512-NEXT:    vpsraw $1, %ymm0, %ymm3
; AVX512-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
; IR under test: per-lane arithmetic shift right by the repeating byte
; pattern <0..7, 7..0, 0..7, 7..0>. The CHECK lines above are autogenerated
; by utils/update_llc_test_checks.py -- regenerate them instead of hand-editing.
  %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}
962
963;
964; Uniform Constant Shifts
965;
966
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrad $7, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $7, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrad $7, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; XOPAVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrad $7, %ymm0, %ymm1
; AVX512-NEXT:    vpsrlq $7, %ymm0, %ymm0
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX512-NEXT:    retq
; IR under test: uniform (splat) arithmetic shift right of every i64 lane
; by 7. The CHECK lines above are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them instead of hand-editing.
  %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}
1014
define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrad $5, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrad $5, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrad $5, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrad $5, %ymm0, %ymm0
; AVX512-NEXT:    retq
; IR under test: uniform (splat) arithmetic shift right of every i32 lane
; by 5. The CHECK lines above are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them instead of hand-editing.
  %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}
1049
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsraw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX512-NEXT:    retq
; IR under test: uniform (splat) arithmetic shift right of every i16 lane
; by 3. The CHECK lines above are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them instead of hand-editing.
  %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}
1084
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshab %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
; IR under test: uniform (splat) arithmetic shift right of every i8 lane
; by 3. The CHECK lines above are autogenerated by
; utils/update_llc_test_checks.py -- regenerate them instead of hand-editing.
  %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}
1141