; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2

;
; Variable Shifts
;

; Logical right shift of <4 x i64> %a where each lane is shifted by the
; amount held in the matching lane of %b.
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <4 x i64> %a, %b
  ret <4 x i64> %shift
}
; Logical right shift of <8 x i32> %a where each lane is shifted by the
; amount held in the matching lane of %b.
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshld %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <8 x i32> %a, %b
  ret <8 x i32> %shift
}
; Logical right shift of <16 x i16> %a where each lane is shifted by the
; amount held in the matching lane of %b.
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <16 x i16> %a, %b
  ret <16 x i16> %shift
}
; Logical right shift of <32 x i8> %a where each lane is shifted by the
; amount held in the matching lane of %b.
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <32 x i8> %a, %b
  ret <32 x i8> %shift
}

;
; Uniform Variable Shifts
;

; Logical right shift of <4 x i64> %a where every lane is shifted by the same
; runtime amount: element 0 of %b, splatted with a zeroinitializer shuffle mask.
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}
; Logical right shift of <8 x i32> %a where every lane is shifted by the same
; runtime amount: element 0 of %b, splatted with a zeroinitializer shuffle mask.
define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOPAVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}
; Logical right shift of <16 x i16> %a where every lane is shifted by the same
; runtime amount: element 0 of %b, splatted with a zeroinitializer shuffle mask.
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovd %xmm1, %eax
; AVX1-NEXT:    movzwl %ax, %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %xmm1, %eax
; AVX2-NEXT:    movzwl %ax, %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovd %xmm1, %eax
; XOPAVX1-NEXT:    movzwl %ax, %eax
; XOPAVX1-NEXT:    vmovd %eax, %xmm1
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vmovd %xmm1, %eax
; XOPAVX2-NEXT:    movzwl %ax, %eax
; XOPAVX2-NEXT:    vmovd %eax, %xmm1
; XOPAVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}
; Logical right shift of <32 x i8> %a where every lane is shifted by the same
; runtime amount: element 0 of %b, splatted with a zeroinitializer shuffle mask.
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm6, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Constant Shifts
;

; Logical right shift of <4 x i64> %a by the non-uniform constant amounts
; <1, 7, 31, 62>, one per lane.
define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}
; Logical right shift of <8 x i32> %a by the non-uniform constant amounts
; <4, 5, 6, 7, 8, 9, 8, 7>, one per lane.
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $9, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrld $8, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <8 x i32> %shift
}
; Logical right shift of <16 x i16> %a by the non-uniform constant amounts
; <0, 1, 2, ..., 15>: lane i is shifted right by i bits.
define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <16 x i16> %shift
}
; Logical right shift of <32 x i8> %a by non-uniform constant amounts. The
; 16-element pattern <0..7, 7..0> is used for lanes 0-15 and repeated for
; lanes 16-31.
define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX1-NEXT:    vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}

;
; Uniform Constant Shifts
;

; Logical right shift of every i64 lane of %a by the same constant amount, 7.
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}
; Logical right shift of every i32 lane of %a by the same constant amount, 5.
define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}
; Logical right shift of every i16 lane of %a by the same constant amount, 3.
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}
; Logical right shift of every i8 lane of %a by the same constant amount, 3.
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}