; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
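;
; These tests check lowering of 256-bit vector logical right shifts (lshr)
; with variable, splatted-variable, constant and splatted-constant shift
; amounts on the AVX1, AVX2, XOP and AVX512BW targets listed above.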
;
; Variable Shifts
;

define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v4i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <4 x i64> %a, %b
  ret <4 x i64> %shift
}

define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshld %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v8i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <8 x i32> %a, %b
  ret <8 x i32> %shift
}

define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v16i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT:    retq
  %shift = lshr <16 x i16> %a, %b
  ret <16 x i16> %shift
}

define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v32i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <32 x i8> %a, %b
  ret <32 x i8> %shift
}

;
; Uniform Variable Shifts
;

define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = lshr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; XOPAVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
; AVX512-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovd %xmm1, %eax
; AVX1-NEXT:    movzwl %ax, %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %xmm1, %eax
; AVX2-NEXT:    movzwl %ax, %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovd %xmm1, %eax
; XOPAVX1-NEXT:    movzwl %ax, %eax
; XOPAVX1-NEXT:    vmovd %eax, %xmm1
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vmovd %xmm1, %eax
; XOPAVX2-NEXT:    movzwl %ax, %eax
; XOPAVX2-NEXT:    vmovd %eax, %xmm1
; XOPAVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vmovd %xmm1, %eax
; AVX512-NEXT:    movzwl %ax, %eax
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm6, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v32i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Constant Shifts
;

define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v4i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}

define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $9, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrld $8, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v8i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <8 x i32> %shift
}

define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
; AVX2-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v16i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512-NEXT:    retq
  %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <16 x i16> %shift
}

define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX1-NEXT:    vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm6, %xmm6, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v32i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0,0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlq $7, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}

define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrld $5, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}

define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}

define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1:       # BB#0:
; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2:       # BB#0:
; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512:       ## BB#0:
; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
  %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}