• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
10;
11; 32-bit runs to make sure we do reasonable things for i64 shifts.
12; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X86-AVX1
13; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2
14
15;
16; Variable Shifts
17;
18
; Per-element (variable) lshr of <4 x i64>. AVX2/XOPAVX2/AVX512* lower this to a
; single vpsrlvq. AVX1 has no variable 64-bit vector shift, so it splits into
; 128-bit halves and blends two scalar-count vpsrlq results per half (vpshufd
; moves the second lane's count into position). XOPAVX1 negates the counts
; (vpsubq from zero) and uses vpshlq per half.
19define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
20; AVX1-LABEL: var_shift_v4i64:
21; AVX1:       # %bb.0:
22; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
23; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
24; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
25; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
26; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
27; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
28; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
29; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
30; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
31; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
32; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
33; AVX1-NEXT:    retq
34;
35; AVX2-LABEL: var_shift_v4i64:
36; AVX2:       # %bb.0:
37; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
38; AVX2-NEXT:    retq
39;
40; XOPAVX1-LABEL: var_shift_v4i64:
41; XOPAVX1:       # %bb.0:
42; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
43; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
44; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
45; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
46; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm4, %xmm2
47; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
48; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
49; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
50; XOPAVX1-NEXT:    retq
51;
52; XOPAVX2-LABEL: var_shift_v4i64:
53; XOPAVX2:       # %bb.0:
54; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
55; XOPAVX2-NEXT:    retq
56;
57; AVX512-LABEL: var_shift_v4i64:
58; AVX512:       # %bb.0:
59; AVX512-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
60; AVX512-NEXT:    retq
61;
62; AVX512VL-LABEL: var_shift_v4i64:
63; AVX512VL:       # %bb.0:
64; AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
65; AVX512VL-NEXT:    retq
66;
67; X86-AVX1-LABEL: var_shift_v4i64:
68; X86-AVX1:       # %bb.0:
69; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
70; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
71; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
72; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
73; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
74; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
75; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
76; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
77; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
78; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
79; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
80; X86-AVX1-NEXT:    retl
81;
82; X86-AVX2-LABEL: var_shift_v4i64:
83; X86-AVX2:       # %bb.0:
84; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
85; X86-AVX2-NEXT:    retl
86  %shift = lshr <4 x i64> %a, %b
87  ret <4 x i64> %shift
88}
89
; Per-element (variable) lshr of <8 x i32>. AVX2/XOPAVX2/AVX512* lower this to a
; single vpsrlvd. AVX1 has no variable 32-bit vector shift: per 128-bit half it
; isolates each of the four shift counts (vpsrldq / vpsrlq $32 / vpunpckhdq with
; zero / vpmovzxdq), performs four uniform vpsrld shifts, and reassembles the
; lanes with vpblendw. XOPAVX1 negates the counts and uses vpshld per half.
90define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
91; AVX1-LABEL: var_shift_v8i32:
92; AVX1:       # %bb.0:
93; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
94; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
95; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
96; AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
97; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
98; AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
99; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
100; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
101; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
102; AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
103; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
104; AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
105; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
106; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
107; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
108; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
109; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
110; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
111; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
112; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
113; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
114; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
115; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
116; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
117; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
118; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
119; AVX1-NEXT:    retq
120;
121; AVX2-LABEL: var_shift_v8i32:
122; AVX2:       # %bb.0:
123; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
124; AVX2-NEXT:    retq
125;
126; XOPAVX1-LABEL: var_shift_v8i32:
127; XOPAVX1:       # %bb.0:
128; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
129; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
130; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
131; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
132; XOPAVX1-NEXT:    vpshld %xmm2, %xmm4, %xmm2
133; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
134; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
135; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
136; XOPAVX1-NEXT:    retq
137;
138; XOPAVX2-LABEL: var_shift_v8i32:
139; XOPAVX2:       # %bb.0:
140; XOPAVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
141; XOPAVX2-NEXT:    retq
142;
143; AVX512-LABEL: var_shift_v8i32:
144; AVX512:       # %bb.0:
145; AVX512-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
146; AVX512-NEXT:    retq
147;
148; AVX512VL-LABEL: var_shift_v8i32:
149; AVX512VL:       # %bb.0:
150; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
151; AVX512VL-NEXT:    retq
152;
153; X86-AVX1-LABEL: var_shift_v8i32:
154; X86-AVX1:       # %bb.0:
155; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
156; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
157; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
158; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
159; X86-AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
160; X86-AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
161; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
162; X86-AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
163; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
164; X86-AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
165; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
166; X86-AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
167; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
168; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
169; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
170; X86-AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
171; X86-AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
172; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
173; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
174; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
175; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
176; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
177; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
178; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
179; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
180; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
181; X86-AVX1-NEXT:    retl
182;
183; X86-AVX2-LABEL: var_shift_v8i32:
184; X86-AVX2:       # %bb.0:
185; X86-AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
186; X86-AVX2-NEXT:    retl
187  %shift = lshr <8 x i32> %a, %b
188  ret <8 x i32> %shift
189}
190
; Per-element (variable) lshr of <16 x i16>. x86 has no variable 16-bit shift
; before AVX-512BW, so each target picks a different strategy:
;  - AVX1: moves the count bits into the byte sign-bit position (vpsllw $12/$4 +
;    vpor), then does a binary "shift ladder" of vpsrlw $8/$4/$2/$1 selected per
;    lane with vpblendvb, doubling the mask (vpaddw) each step; per 128-bit half.
;  - AVX2: unpacks words to dwords against zero, uses vpsrlvd on both halves,
;    then repacks with vpsrld $16 + vpackusdw.
;  - AVX512DQ(/VL): zero-extends to <16 x i32> in zmm, vpsrlvd, vpmovdw back.
;  - AVX512BW: uses vpsrlvw on zmm (ymm inputs re-declared as zmm via kill).
;  - AVX512BWVL: single ymm vpsrlvw.
;  - XOP: negates counts and uses vpshlw per half.
191define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
192; AVX1-LABEL: var_shift_v16i16:
193; AVX1:       # %bb.0:
194; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
195; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
196; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
197; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
198; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
199; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
200; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
201; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
202; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
203; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
204; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
205; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
206; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
207; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
208; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
209; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
210; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
211; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
212; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
213; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
214; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
215; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
216; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
217; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
218; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
219; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
220; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
221; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
222; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
223; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
224; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
225; AVX1-NEXT:    retq
226;
227; AVX2-LABEL: var_shift_v16i16:
228; AVX2:       # %bb.0:
229; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
230; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
231; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
232; AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
233; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
234; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
235; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
236; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
237; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
238; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
239; AVX2-NEXT:    retq
240;
241; XOPAVX1-LABEL: var_shift_v16i16:
242; XOPAVX1:       # %bb.0:
243; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
244; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
245; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
246; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
247; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
248; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
249; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
250; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
251; XOPAVX1-NEXT:    retq
252;
253; XOPAVX2-LABEL: var_shift_v16i16:
254; XOPAVX2:       # %bb.0:
255; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
256; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
257; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
258; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
259; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
260; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
261; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
262; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
263; XOPAVX2-NEXT:    retq
264;
265; AVX512DQ-LABEL: var_shift_v16i16:
266; AVX512DQ:       # %bb.0:
267; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
268; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
269; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
270; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
271; AVX512DQ-NEXT:    retq
272;
273; AVX512BW-LABEL: var_shift_v16i16:
274; AVX512BW:       # %bb.0:
275; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
276; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
277; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
278; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
279; AVX512BW-NEXT:    retq
280;
281; AVX512DQVL-LABEL: var_shift_v16i16:
282; AVX512DQVL:       # %bb.0:
283; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
284; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
285; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
286; AVX512DQVL-NEXT:    vpmovdw %zmm0, %ymm0
287; AVX512DQVL-NEXT:    retq
288;
289; AVX512BWVL-LABEL: var_shift_v16i16:
290; AVX512BWVL:       # %bb.0:
291; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
292; AVX512BWVL-NEXT:    retq
293;
294; X86-AVX1-LABEL: var_shift_v16i16:
295; X86-AVX1:       # %bb.0:
296; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
297; X86-AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
298; X86-AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
299; X86-AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
300; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
301; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
302; X86-AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
303; X86-AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
304; X86-AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
305; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
306; X86-AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
307; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
308; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
309; X86-AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
310; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
311; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
312; X86-AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
313; X86-AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
314; X86-AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
315; X86-AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
316; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
317; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
318; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
319; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
320; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
321; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
322; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
323; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
324; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
325; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
326; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
327; X86-AVX1-NEXT:    retl
328;
329; X86-AVX2-LABEL: var_shift_v16i16:
330; X86-AVX2:       # %bb.0:
331; X86-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
332; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
333; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
334; X86-AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
335; X86-AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
336; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
337; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
338; X86-AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
339; X86-AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
340; X86-AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
341; X86-AVX2-NEXT:    retl
342  %shift = lshr <16 x i16> %a, %b
343  ret <16 x i16> %shift
344}
345
; Per-element (variable) lshr of <32 x i8>. x86 has no per-byte shift at all:
;  - AVX1/AVX2/AVX512DQ(/VL): put the 3 count bits in the byte sign-bit position
;    (vpsllw $5), then a blend ladder of word shifts vpsrlw $4/$2/$1 masked with
;    0x0F/0x3F/0x7F constants to discard bits shifted in from the neighboring
;    byte, selecting per byte with vpblendvb and doubling the mask (vpaddb) each
;    step. AVX1 does this per 128-bit half with reusable mask registers; the
;    X86-AVX2 run loads the masks from constant-pool labels ({{\.LCPI.*}}).
;  - AVX512BW(/BWVL): zero-extends bytes to words in zmm, vpsrlvw, vpmovwb back.
;  - XOP: negates counts and uses vpshlb per half.
346define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
347; AVX1-LABEL: var_shift_v32i8:
348; AVX1:       # %bb.0:
349; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
350; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
351; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
352; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
353; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
354; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
355; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
356; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
357; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
358; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
359; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
360; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
361; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
362; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
363; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
364; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
365; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
366; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
367; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
368; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
369; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
370; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
371; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
372; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
373; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
374; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
375; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
376; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
377; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
378; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
379; AVX1-NEXT:    retq
380;
381; AVX2-LABEL: var_shift_v32i8:
382; AVX2:       # %bb.0:
383; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
384; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
385; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
386; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
387; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
388; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
389; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
390; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
391; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
392; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
393; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
394; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
395; AVX2-NEXT:    retq
396;
397; XOPAVX1-LABEL: var_shift_v32i8:
398; XOPAVX1:       # %bb.0:
399; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
400; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
401; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
402; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
403; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
404; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
405; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
406; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
407; XOPAVX1-NEXT:    retq
408;
409; XOPAVX2-LABEL: var_shift_v32i8:
410; XOPAVX2:       # %bb.0:
411; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
412; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
413; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
414; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
415; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
416; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
417; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
418; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
419; XOPAVX2-NEXT:    retq
420;
421; AVX512DQ-LABEL: var_shift_v32i8:
422; AVX512DQ:       # %bb.0:
423; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
424; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm2
425; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
426; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
427; AVX512DQ-NEXT:    vpsrlw $2, %ymm0, %ymm2
428; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
429; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
430; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
431; AVX512DQ-NEXT:    vpsrlw $1, %ymm0, %ymm2
432; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
433; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
434; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
435; AVX512DQ-NEXT:    retq
436;
437; AVX512BW-LABEL: var_shift_v32i8:
438; AVX512BW:       # %bb.0:
439; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
440; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
441; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
442; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
443; AVX512BW-NEXT:    retq
444;
445; AVX512DQVL-LABEL: var_shift_v32i8:
446; AVX512DQVL:       # %bb.0:
447; AVX512DQVL-NEXT:    vpsllw $5, %ymm1, %ymm1
448; AVX512DQVL-NEXT:    vpsrlw $4, %ymm0, %ymm2
449; AVX512DQVL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
450; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
451; AVX512DQVL-NEXT:    vpsrlw $2, %ymm0, %ymm2
452; AVX512DQVL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
453; AVX512DQVL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
454; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
455; AVX512DQVL-NEXT:    vpsrlw $1, %ymm0, %ymm2
456; AVX512DQVL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
457; AVX512DQVL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
458; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
459; AVX512DQVL-NEXT:    retq
460;
461; AVX512BWVL-LABEL: var_shift_v32i8:
462; AVX512BWVL:       # %bb.0:
463; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
464; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
465; AVX512BWVL-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
466; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
467; AVX512BWVL-NEXT:    retq
468;
469; X86-AVX1-LABEL: var_shift_v32i8:
470; X86-AVX1:       # %bb.0:
471; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
472; X86-AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
473; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
474; X86-AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
475; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
476; X86-AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
477; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
478; X86-AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
479; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
480; X86-AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
481; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
482; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
483; X86-AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
484; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
485; X86-AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
486; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
487; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
488; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
489; X86-AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
490; X86-AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
491; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
492; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
493; X86-AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
494; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
495; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
496; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
497; X86-AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
498; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
499; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
500; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
501; X86-AVX1-NEXT:    retl
502;
503; X86-AVX2-LABEL: var_shift_v32i8:
504; X86-AVX2:       # %bb.0:
505; X86-AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
506; X86-AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
507; X86-AVX2-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
508; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
509; X86-AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
510; X86-AVX2-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
511; X86-AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
512; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
513; X86-AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
514; X86-AVX2-NEXT:    vpand {{\.LCPI.*}}, %ymm2, %ymm2
515; X86-AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
516; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
517; X86-AVX2-NEXT:    retl
518  %shift = lshr <32 x i8> %a, %b
519  ret <32 x i8> %shift
520}
521
522;
523; Uniform Variable Shifts
524;
525
; Uniform (splatted) lshr of <4 x i64>: all lanes use the same count, so every
; target uses the count-in-xmm form of vpsrlq (which reads the scalar count from
; element 0 of %xmm1). AVX1 only needs to apply it to each 128-bit half.
526define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
527; AVX1-LABEL: splatvar_shift_v4i64:
528; AVX1:       # %bb.0:
529; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
530; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
531; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
532; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
533; AVX1-NEXT:    retq
534;
535; AVX2-LABEL: splatvar_shift_v4i64:
536; AVX2:       # %bb.0:
537; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
538; AVX2-NEXT:    retq
539;
540; XOPAVX1-LABEL: splatvar_shift_v4i64:
541; XOPAVX1:       # %bb.0:
542; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
543; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
544; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
545; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
546; XOPAVX1-NEXT:    retq
547;
548; XOPAVX2-LABEL: splatvar_shift_v4i64:
549; XOPAVX2:       # %bb.0:
550; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
551; XOPAVX2-NEXT:    retq
552;
553; AVX512-LABEL: splatvar_shift_v4i64:
554; AVX512:       # %bb.0:
555; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
556; AVX512-NEXT:    retq
557;
558; AVX512VL-LABEL: splatvar_shift_v4i64:
559; AVX512VL:       # %bb.0:
560; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
561; AVX512VL-NEXT:    retq
562;
563; X86-AVX1-LABEL: splatvar_shift_v4i64:
564; X86-AVX1:       # %bb.0:
565; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
566; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
567; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
568; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
569; X86-AVX1-NEXT:    retl
570;
571; X86-AVX2-LABEL: splatvar_shift_v4i64:
572; X86-AVX2:       # %bb.0:
573; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
574; X86-AVX2-NEXT:    retl
575  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
576  %shift = lshr <4 x i64> %a, %splat
577  ret <4 x i64> %shift
578}
579
; Uniform (splatted) lshr of <8 x i32>: every target zero-extends the low i32
; count to 64 bits (vpmovzxdq) — the count-in-xmm form of vpsrld consumes a
; 64-bit count from element 0 — then applies vpsrld with the shared count.
; AVX1 again applies it per 128-bit half.
580define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
581; AVX1-LABEL: splatvar_shift_v8i32:
582; AVX1:       # %bb.0:
583; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
584; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
585; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
586; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
587; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
588; AVX1-NEXT:    retq
589;
590; AVX2-LABEL: splatvar_shift_v8i32:
591; AVX2:       # %bb.0:
592; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
593; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
594; AVX2-NEXT:    retq
595;
596; XOPAVX1-LABEL: splatvar_shift_v8i32:
597; XOPAVX1:       # %bb.0:
598; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
599; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
600; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
601; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
602; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
603; XOPAVX1-NEXT:    retq
604;
605; XOPAVX2-LABEL: splatvar_shift_v8i32:
606; XOPAVX2:       # %bb.0:
607; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
608; XOPAVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
609; XOPAVX2-NEXT:    retq
610;
611; AVX512-LABEL: splatvar_shift_v8i32:
612; AVX512:       # %bb.0:
613; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
614; AVX512-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
615; AVX512-NEXT:    retq
616;
617; AVX512VL-LABEL: splatvar_shift_v8i32:
618; AVX512VL:       # %bb.0:
619; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
620; AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
621; AVX512VL-NEXT:    retq
622;
623; X86-AVX1-LABEL: splatvar_shift_v8i32:
624; X86-AVX1:       # %bb.0:
625; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
626; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
627; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
628; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
629; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
630; X86-AVX1-NEXT:    retl
631;
632; X86-AVX2-LABEL: splatvar_shift_v8i32:
633; X86-AVX2:       # %bb.0:
634; X86-AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
635; X86-AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
636; X86-AVX2-NEXT:    retl
637  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
638  %shift = lshr <8 x i32> %a, %splat
639  ret <8 x i32> %shift
640}
641
; lshr of <16 x i16> by the first element of %b splatted to all lanes.
; Expected lowering: zero-extend the bottom i16 (vpmovzxwq) into a scalar
; count and use vpsrlw; AVX1-class targets split the 256-bit vector into
; two 128-bit halves.  CHECK lines are autogenerated -- regenerate with
; utils/update_llc_test_checks.py rather than hand-editing.
642define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
643; AVX1-LABEL: splatvar_shift_v16i16:
644; AVX1:       # %bb.0:
645; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
646; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
647; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
648; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
649; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
650; AVX1-NEXT:    retq
651;
652; AVX2-LABEL: splatvar_shift_v16i16:
653; AVX2:       # %bb.0:
654; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
655; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
656; AVX2-NEXT:    retq
657;
658; XOPAVX1-LABEL: splatvar_shift_v16i16:
659; XOPAVX1:       # %bb.0:
660; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
661; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
662; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
663; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
664; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
665; XOPAVX1-NEXT:    retq
666;
667; XOPAVX2-LABEL: splatvar_shift_v16i16:
668; XOPAVX2:       # %bb.0:
669; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
670; XOPAVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
671; XOPAVX2-NEXT:    retq
672;
673; AVX512-LABEL: splatvar_shift_v16i16:
674; AVX512:       # %bb.0:
675; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
676; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
677; AVX512-NEXT:    retq
678;
679; AVX512VL-LABEL: splatvar_shift_v16i16:
680; AVX512VL:       # %bb.0:
681; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
682; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
683; AVX512VL-NEXT:    retq
684;
685; X86-AVX1-LABEL: splatvar_shift_v16i16:
686; X86-AVX1:       # %bb.0:
687; X86-AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
688; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
689; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
690; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
691; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
692; X86-AVX1-NEXT:    retl
693;
694; X86-AVX2-LABEL: splatvar_shift_v16i16:
695; X86-AVX2:       # %bb.0:
696; X86-AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
697; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
698; X86-AVX2-NEXT:    retl
699  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
700  %shift = lshr <16 x i16> %a, %splat
701  ret <16 x i16> %shift
702}
703
; lshr of <32 x i8> by a splatted byte count.  x86 has no vector byte
; shifts, so non-XOP/non-AVX512BW targets shift words (vpsrlw) and mask
; away bits shifted in from the neighbouring byte (mask built by shifting
; all-ones); XOP uses vpshlb with a negated count; AVX512BW widens to
; <32 x i16> in a zmm, shifts, and truncates back (vpmovwb).  CHECK lines
; are autogenerated -- regenerate with utils/update_llc_test_checks.py.
704define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
705; AVX1-LABEL: splatvar_shift_v32i8:
706; AVX1:       # %bb.0:
707; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
708; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
709; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
710; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
711; AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
712; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
713; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
714; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
715; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
716; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
717; AVX1-NEXT:    retq
718;
719; AVX2-LABEL: splatvar_shift_v32i8:
720; AVX2:       # %bb.0:
721; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
722; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
723; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
724; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
725; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
726; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
727; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
728; AVX2-NEXT:    retq
729;
730; XOPAVX1-LABEL: splatvar_shift_v32i8:
731; XOPAVX1:       # %bb.0:
732; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
733; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
734; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
735; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
736; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
737; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
738; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
739; XOPAVX1-NEXT:    retq
740;
741; XOPAVX2-LABEL: splatvar_shift_v32i8:
742; XOPAVX2:       # %bb.0:
743; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
744; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
745; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
746; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
747; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
748; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
749; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
750; XOPAVX2-NEXT:    retq
751;
752; AVX512DQ-LABEL: splatvar_shift_v32i8:
753; AVX512DQ:       # %bb.0:
754; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
755; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
756; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
757; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
758; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
759; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
760; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
761; AVX512DQ-NEXT:    retq
762;
763; AVX512BW-LABEL: splatvar_shift_v32i8:
764; AVX512BW:       # %bb.0:
765; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
766; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
767; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
768; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
769; AVX512BW-NEXT:    retq
770;
771; AVX512DQVL-LABEL: splatvar_shift_v32i8:
772; AVX512DQVL:       # %bb.0:
773; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
774; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
775; AVX512DQVL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
776; AVX512DQVL-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
777; AVX512DQVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
778; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %ymm1
779; AVX512DQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
780; AVX512DQVL-NEXT:    retq
781;
782; AVX512BWVL-LABEL: splatvar_shift_v32i8:
783; AVX512BWVL:       # %bb.0:
784; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
785; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
786; AVX512BWVL-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
787; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
788; AVX512BWVL-NEXT:    retq
789;
790; X86-AVX1-LABEL: splatvar_shift_v32i8:
791; X86-AVX1:       # %bb.0:
792; X86-AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
793; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
794; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
795; X86-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
796; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
797; X86-AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
798; X86-AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
799; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
800; X86-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
801; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
802; X86-AVX1-NEXT:    retl
803;
804; X86-AVX2-LABEL: splatvar_shift_v32i8:
805; X86-AVX2:       # %bb.0:
806; X86-AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
807; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
808; X86-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
809; X86-AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
810; X86-AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
811; X86-AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
812; X86-AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
813; X86-AVX2-NEXT:    retl
814  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
815  %shift = lshr <32 x i8> %a, %splat
816  ret <32 x i8> %shift
817}
818
819;
820; Constant Shifts
821;
822
; lshr of <4 x i64> by the constant vector <1, 7, 31, 62>.  AVX2+ targets
; use the variable-shift vpsrlvq with a memory constant; AVX1 has no
; per-element i64 shift, so each 128-bit half is shifted twice with scalar
; counts and blended.  CHECK lines are autogenerated -- regenerate with
; utils/update_llc_test_checks.py rather than hand-editing.
823define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
824; AVX1-LABEL: constant_shift_v4i64:
825; AVX1:       # %bb.0:
826; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
827; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
828; AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
829; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
830; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
831; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
832; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
833; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
834; AVX1-NEXT:    retq
835;
836; AVX2-LABEL: constant_shift_v4i64:
837; AVX2:       # %bb.0:
838; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
839; AVX2-NEXT:    retq
840;
841; XOPAVX1-LABEL: constant_shift_v4i64:
842; XOPAVX1:       # %bb.0:
843; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm1
844; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
845; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm0
846; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
847; XOPAVX1-NEXT:    retq
848;
849; XOPAVX2-LABEL: constant_shift_v4i64:
850; XOPAVX2:       # %bb.0:
851; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
852; XOPAVX2-NEXT:    retq
853;
854; AVX512-LABEL: constant_shift_v4i64:
855; AVX512:       # %bb.0:
856; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
857; AVX512-NEXT:    retq
858;
859; AVX512VL-LABEL: constant_shift_v4i64:
860; AVX512VL:       # %bb.0:
861; AVX512VL-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
862; AVX512VL-NEXT:    retq
863;
864; X86-AVX1-LABEL: constant_shift_v4i64:
865; X86-AVX1:       # %bb.0:
866; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
867; X86-AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
868; X86-AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
869; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
870; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
871; X86-AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
872; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
873; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
874; X86-AVX1-NEXT:    retl
875;
876; X86-AVX2-LABEL: constant_shift_v4i64:
877; X86-AVX2:       # %bb.0:
878; X86-AVX2-NEXT:    vpsrlvq {{\.LCPI.*}}, %ymm0, %ymm0
879; X86-AVX2-NEXT:    retl
880  %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
881  ret <4 x i64> %shift
882}
883
; lshr of <8 x i32> by the constant vector <4,5,6,7,8,9,8,7>.  AVX2+
; targets use vpsrlvd with a memory constant; AVX1 builds each half from
; four scalar-count vpsrld results combined with vpblendw.  CHECK lines
; are autogenerated -- regenerate with utils/update_llc_test_checks.py.
884define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
885; AVX1-LABEL: constant_shift_v8i32:
886; AVX1:       # %bb.0:
887; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
888; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
889; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
890; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
891; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm3
892; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
893; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
894; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
895; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm2
896; AVX1-NEXT:    vpsrld $9, %xmm0, %xmm3
897; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
898; AVX1-NEXT:    vpsrld $8, %xmm0, %xmm0
899; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
900; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
901; AVX1-NEXT:    retq
902;
903; AVX2-LABEL: constant_shift_v8i32:
904; AVX2:       # %bb.0:
905; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
906; AVX2-NEXT:    retq
907;
908; XOPAVX1-LABEL: constant_shift_v8i32:
909; XOPAVX1:       # %bb.0:
910; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm1
911; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
912; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
913; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
914; XOPAVX1-NEXT:    retq
915;
916; XOPAVX2-LABEL: constant_shift_v8i32:
917; XOPAVX2:       # %bb.0:
918; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
919; XOPAVX2-NEXT:    retq
920;
921; AVX512-LABEL: constant_shift_v8i32:
922; AVX512:       # %bb.0:
923; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
924; AVX512-NEXT:    retq
925;
926; AVX512VL-LABEL: constant_shift_v8i32:
927; AVX512VL:       # %bb.0:
928; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
929; AVX512VL-NEXT:    retq
930;
931; X86-AVX1-LABEL: constant_shift_v8i32:
932; X86-AVX1:       # %bb.0:
933; X86-AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
934; X86-AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
935; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
936; X86-AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
937; X86-AVX1-NEXT:    vpsrld $4, %xmm0, %xmm3
938; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
939; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
940; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
941; X86-AVX1-NEXT:    vpsrld $7, %xmm0, %xmm2
942; X86-AVX1-NEXT:    vpsrld $9, %xmm0, %xmm3
943; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
944; X86-AVX1-NEXT:    vpsrld $8, %xmm0, %xmm0
945; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
946; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
947; X86-AVX1-NEXT:    retl
948;
949; X86-AVX2-LABEL: constant_shift_v8i32:
950; X86-AVX2:       # %bb.0:
951; X86-AVX2-NEXT:    vpsrlvd {{\.LCPI.*}}, %ymm0, %ymm0
952; X86-AVX2-NEXT:    retl
953  %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
954  ret <8 x i32> %shift
955}
956
; lshr of <16 x i16> by per-lane constants 0..15.  Most targets lower the
; constant shift to a vpmulhuw by reciprocal powers of two, with a blend
; to patch in the unmodified lane for shift count 0; AVX512BW targets use
; vpsrlvw directly.  CHECK lines are autogenerated -- regenerate with
; utils/update_llc_test_checks.py rather than hand-editing.
957define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
958; AVX1-LABEL: constant_shift_v16i16:
959; AVX1:       # %bb.0:
960; AVX1-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
961; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
962; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
963; AVX1-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
964; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
965; AVX1-NEXT:    retq
966;
967; AVX2-LABEL: constant_shift_v16i16:
968; AVX2:       # %bb.0:
969; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
970; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
971; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
972; AVX2-NEXT:    retq
973;
974; XOPAVX1-LABEL: constant_shift_v16i16:
975; XOPAVX1:       # %bb.0:
976; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
977; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
978; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm0
979; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
980; XOPAVX1-NEXT:    retq
981;
982; XOPAVX2-LABEL: constant_shift_v16i16:
983; XOPAVX2:       # %bb.0:
984; XOPAVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
985; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
986; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
987; XOPAVX2-NEXT:    retq
988;
989; AVX512DQ-LABEL: constant_shift_v16i16:
990; AVX512DQ:       # %bb.0:
991; AVX512DQ-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
992; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
993; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
994; AVX512DQ-NEXT:    retq
995;
996; AVX512BW-LABEL: constant_shift_v16i16:
997; AVX512BW:       # %bb.0:
998; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
999; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1000; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
1001; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1002; AVX512BW-NEXT:    retq
1003;
1004; AVX512DQVL-LABEL: constant_shift_v16i16:
1005; AVX512DQVL:       # %bb.0:
1006; AVX512DQVL-NEXT:    vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
1007; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1008; AVX512DQVL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1009; AVX512DQVL-NEXT:    retq
1010;
1011; AVX512BWVL-LABEL: constant_shift_v16i16:
1012; AVX512BWVL:       # %bb.0:
1013; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
1014; AVX512BWVL-NEXT:    retq
1015;
1016; X86-AVX1-LABEL: constant_shift_v16i16:
1017; X86-AVX1:       # %bb.0:
1018; X86-AVX1-NEXT:    vpmulhuw {{\.LCPI.*}}, %xmm0, %xmm1
1019; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1020; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1021; X86-AVX1-NEXT:    vpmulhuw {{\.LCPI.*}}, %xmm0, %xmm0
1022; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1023; X86-AVX1-NEXT:    retl
1024;
1025; X86-AVX2-LABEL: constant_shift_v16i16:
1026; X86-AVX2:       # %bb.0:
1027; X86-AVX2-NEXT:    vpmulhuw {{\.LCPI.*}}, %ymm0, %ymm1
1028; X86-AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1029; X86-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1030; X86-AVX2-NEXT:    retl
1031  %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1032  ret <16 x i16> %shift
1033}
1034
; lshr of <32 x i8> by per-lane constants (0..7 ramp, mirrored).  With no
; vector byte shift available, most targets unpack bytes to words against
; zero, multiply by power-of-two constants (vpmullw), take the high byte
; via vpsrlw $8, and repack with vpackuswb; XOP uses vpshlb with negated
; counts and AVX512BW widens to words in a zmm and uses vpsrlvw/vpmovwb.
; CHECK lines are autogenerated -- regenerate with
; utils/update_llc_test_checks.py rather than hand-editing.
1035define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
1036; AVX1-LABEL: constant_shift_v32i8:
1037; AVX1:       # %bb.0:
1038; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1039; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1040; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1041; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256]
1042; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
1043; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
1044; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1045; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2]
1046; AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
1047; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
1048; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
1049; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1050; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
1051; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
1052; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1053; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm0
1054; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1055; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1056; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1057; AVX1-NEXT:    retq
1058;
1059; AVX2-LABEL: constant_shift_v32i8:
1060; AVX2:       # %bb.0:
1061; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1062; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1063; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
1064; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
1065; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1066; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
1067; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1068; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
1069; AVX2-NEXT:    retq
1070;
1071; XOPAVX1-LABEL: constant_shift_v32i8:
1072; XOPAVX1:       # %bb.0:
1073; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1074; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
1075; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
1076; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
1077; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1078; XOPAVX1-NEXT:    retq
1079;
1080; XOPAVX2-LABEL: constant_shift_v32i8:
1081; XOPAVX2:       # %bb.0:
1082; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1083; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
1084; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
1085; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
1086; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1087; XOPAVX2-NEXT:    retq
1088;
1089; AVX512DQ-LABEL: constant_shift_v32i8:
1090; AVX512DQ:       # %bb.0:
1091; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1092; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1093; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
1094; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
1095; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1096; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
1097; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
1098; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
1099; AVX512DQ-NEXT:    retq
1100;
1101; AVX512BW-LABEL: constant_shift_v32i8:
1102; AVX512BW:       # %bb.0:
1103; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1104; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
1105; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1106; AVX512BW-NEXT:    retq
1107;
1108; AVX512DQVL-LABEL: constant_shift_v32i8:
1109; AVX512DQVL:       # %bb.0:
1110; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1111; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1112; AVX512DQVL-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
1113; AVX512DQVL-NEXT:    vpsrlw $8, %ymm2, %ymm2
1114; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1115; AVX512DQVL-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
1116; AVX512DQVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
1117; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
1118; AVX512DQVL-NEXT:    retq
1119;
1120; AVX512BWVL-LABEL: constant_shift_v32i8:
1121; AVX512BWVL:       # %bb.0:
1122; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1123; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
1124; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
1125; AVX512BWVL-NEXT:    retq
1126;
1127; X86-AVX1-LABEL: constant_shift_v32i8:
1128; X86-AVX1:       # %bb.0:
1129; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1130; X86-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1131; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1132; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256]
1133; X86-AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
1134; X86-AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
1135; X86-AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1136; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2]
1137; X86-AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
1138; X86-AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
1139; X86-AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
1140; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1141; X86-AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
1142; X86-AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
1143; X86-AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1144; X86-AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm0
1145; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1146; X86-AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1147; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1148; X86-AVX1-NEXT:    retl
1149;
1150; X86-AVX2-LABEL: constant_shift_v32i8:
1151; X86-AVX2:       # %bb.0:
1152; X86-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1153; X86-AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1154; X86-AVX2-NEXT:    vpmullw {{\.LCPI.*}}, %ymm2, %ymm2
1155; X86-AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
1156; X86-AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1157; X86-AVX2-NEXT:    vpmullw {{\.LCPI.*}}, %ymm0, %ymm0
1158; X86-AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1159; X86-AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
1160; X86-AVX2-NEXT:    retl
1161  %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
1162  ret <32 x i8> %shift
1163}
1164
1165;
1166; Uniform Constant Shifts
1167;
1168
; Uniform-constant lshr of <4 x i64> by 7.  All AVX2+ targets emit a
; single immediate vpsrlq on the ymm register; AVX1 splits into two
; 128-bit immediate shifts.  CHECK lines are autogenerated -- regenerate
; with utils/update_llc_test_checks.py rather than hand-editing.
1169define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
1170; AVX1-LABEL: splatconstant_shift_v4i64:
1171; AVX1:       # %bb.0:
1172; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
1173; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1174; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
1175; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1176; AVX1-NEXT:    retq
1177;
1178; AVX2-LABEL: splatconstant_shift_v4i64:
1179; AVX2:       # %bb.0:
1180; AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
1181; AVX2-NEXT:    retq
1182;
1183; XOPAVX1-LABEL: splatconstant_shift_v4i64:
1184; XOPAVX1:       # %bb.0:
1185; XOPAVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
1186; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1187; XOPAVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
1188; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1189; XOPAVX1-NEXT:    retq
1190;
1191; XOPAVX2-LABEL: splatconstant_shift_v4i64:
1192; XOPAVX2:       # %bb.0:
1193; XOPAVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
1194; XOPAVX2-NEXT:    retq
1195;
1196; AVX512-LABEL: splatconstant_shift_v4i64:
1197; AVX512:       # %bb.0:
1198; AVX512-NEXT:    vpsrlq $7, %ymm0, %ymm0
1199; AVX512-NEXT:    retq
1200;
1201; AVX512VL-LABEL: splatconstant_shift_v4i64:
1202; AVX512VL:       # %bb.0:
1203; AVX512VL-NEXT:    vpsrlq $7, %ymm0, %ymm0
1204; AVX512VL-NEXT:    retq
1205;
1206; X86-AVX1-LABEL: splatconstant_shift_v4i64:
1207; X86-AVX1:       # %bb.0:
1208; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
1209; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1210; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
1211; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1212; X86-AVX1-NEXT:    retl
1213;
1214; X86-AVX2-LABEL: splatconstant_shift_v4i64:
1215; X86-AVX2:       # %bb.0:
1216; X86-AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
1217; X86-AVX2-NEXT:    retl
1218  %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
1219  ret <4 x i64> %shift
1220}
1221
; Uniform-constant lshr of <8 x i32> by 5.  All AVX2+ targets emit a
; single immediate vpsrld on the ymm register; AVX1 splits into two
; 128-bit immediate shifts.  CHECK lines are autogenerated -- regenerate
; with utils/update_llc_test_checks.py rather than hand-editing.
1222define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
1223; AVX1-LABEL: splatconstant_shift_v8i32:
1224; AVX1:       # %bb.0:
1225; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
1226; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1227; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
1228; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1229; AVX1-NEXT:    retq
1230;
1231; AVX2-LABEL: splatconstant_shift_v8i32:
1232; AVX2:       # %bb.0:
1233; AVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
1234; AVX2-NEXT:    retq
1235;
1236; XOPAVX1-LABEL: splatconstant_shift_v8i32:
1237; XOPAVX1:       # %bb.0:
1238; XOPAVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
1239; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1240; XOPAVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
1241; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1242; XOPAVX1-NEXT:    retq
1243;
1244; XOPAVX2-LABEL: splatconstant_shift_v8i32:
1245; XOPAVX2:       # %bb.0:
1246; XOPAVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
1247; XOPAVX2-NEXT:    retq
1248;
1249; AVX512-LABEL: splatconstant_shift_v8i32:
1250; AVX512:       # %bb.0:
1251; AVX512-NEXT:    vpsrld $5, %ymm0, %ymm0
1252; AVX512-NEXT:    retq
1253;
1254; AVX512VL-LABEL: splatconstant_shift_v8i32:
1255; AVX512VL:       # %bb.0:
1256; AVX512VL-NEXT:    vpsrld $5, %ymm0, %ymm0
1257; AVX512VL-NEXT:    retq
1258;
1259; X86-AVX1-LABEL: splatconstant_shift_v8i32:
1260; X86-AVX1:       # %bb.0:
1261; X86-AVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
1262; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1263; X86-AVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
1264; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1265; X86-AVX1-NEXT:    retl
1266;
1267; X86-AVX2-LABEL: splatconstant_shift_v8i32:
1268; X86-AVX2:       # %bb.0:
1269; X86-AVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
1270; X86-AVX2-NEXT:    retl
1271  %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
1272  ret <8 x i32> %shift
1273}
1274
; Uniform (splat) constant logical shift right: every i16 lane of %a is
; shifted right by 3, lowered with vpsrlw. As above, 256-bit-capable targets
; use one ymm instruction while AVX1-class targets split into two xmm halves.
; NOTE: the check lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate with that script rather than
; editing them by hand.
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}
1327
; Uniform (splat) constant logical shift right on i8 lanes. x86 has no
; vector byte-shift instruction, so the expected lowering is a 16-bit vpsrlw
; followed by a vpand with 31 (0x1F) in each byte to clear the bits shifted
; in from the neighboring byte of each word. XOP targets instead use their
; native per-byte variable shift (vpshlb with a splat of -3, i.e. 253).
; NOTE: the check lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate with that script rather than
; editing them by hand.
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; X86-AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}
1392
; Shift + truncate + shift: (trunc(%x lshr 24) lshr 12). Checks that the two
; constant shifts are combined across the trunc into a single 36-bit (24+12)
; i64 shift followed by a 20-bit mask (1048575 = 2^20-1) after narrowing,
; instead of performing two separate shift operations.
; NOTE: the check lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate with that script rather than
; editing them by hand.
define <4 x i32> @sh_trunc_sh_vec(<4 x i64> %x) {
; AVX1-LABEL: sh_trunc_sh_vec:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $36, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlq $36, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: sh_trunc_sh_vec:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $36, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: sh_trunc_sh_vec:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vpsrlq $36, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpsrlq $36, %xmm0, %xmm0
; XOPAVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vzeroupper
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: sh_trunc_sh_vec:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlq $36, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; XOPAVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
; XOPAVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vzeroupper
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: sh_trunc_sh_vec:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $36, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: sh_trunc_sh_vec:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $36, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: sh_trunc_sh_vec:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpsrlq $36, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrlq $36, %xmm0, %xmm0
; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X86-AVX1-NEXT:    vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: sh_trunc_sh_vec:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlq $36, %ymm0, %ymm0
; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
; X86-AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
  %s = lshr <4 x i64> %x, <i64 24, i64 24, i64 24, i64 24>
  %t = trunc <4 x i64> %s to <4 x i32>
  %r = lshr <4 x i32> %t, <i32 12, i32 12, i32 12, i32 12>
  ret <4 x i32> %r
}
1475