• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
12
13declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
14declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
15declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)
16declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
17
18;
19; Variable Shifts
20;
21
; Rotate test for <4 x i64> with a variable (per-element) amount.
; The body calls @llvm.fshl.v4i64 with %x as both data operands and %amt as
; the shift amount; per the LLVM LangRef, fshl with identical first two
; operands is a rotate left. Each prefixed CHECK group below is the expected
; llc output for the RUN line(s) at the top of the file that use that prefix.
; The assertions are autogenerated (see the NOTE at the top of the file).
22define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
23; AVX1-LABEL: var_funnnel_v4i64:
24; AVX1:       # %bb.0:
25; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
26; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm3
27; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
28; AVX1-NEXT:    vpsllq %xmm4, %xmm2, %xmm5
29; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
30; AVX1-NEXT:    vpsllq %xmm4, %xmm2, %xmm4
31; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
32; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm5
33; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
34; AVX1-NEXT:    vpsllq %xmm3, %xmm0, %xmm3
35; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
36; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
37; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
38; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
39; AVX1-NEXT:    vpsubq %xmm4, %xmm5, %xmm4
40; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [63,63]
41; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
42; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm7
43; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
44; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm2
45; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4,5,6,7]
46; AVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm1
47; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
48; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm4
49; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
50; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
51; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
52; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
53; AVX1-NEXT:    vorps %ymm0, %ymm3, %ymm0
54; AVX1-NEXT:    retq
55;
56; AVX2-LABEL: var_funnnel_v4i64:
57; AVX2:       # %bb.0:
58; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [63,63,63,63]
59; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
60; AVX2-NEXT:    vpsllvq %ymm3, %ymm0, %ymm3
61; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
62; AVX2-NEXT:    vpsubq %ymm1, %ymm4, %ymm1
63; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
64; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
65; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
66; AVX2-NEXT:    retq
67;
68; AVX512F-LABEL: var_funnnel_v4i64:
69; AVX512F:       # %bb.0:
70; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
71; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
72; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
73; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
74; AVX512F-NEXT:    retq
75;
76; AVX512VL-LABEL: var_funnnel_v4i64:
77; AVX512VL:       # %bb.0:
78; AVX512VL-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
79; AVX512VL-NEXT:    retq
80;
81; AVX512BW-LABEL: var_funnnel_v4i64:
82; AVX512BW:       # %bb.0:
83; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
84; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
85; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
86; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
87; AVX512BW-NEXT:    retq
88;
89; AVX512VLBW-LABEL: var_funnnel_v4i64:
90; AVX512VLBW:       # %bb.0:
91; AVX512VLBW-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
92; AVX512VLBW-NEXT:    retq
93;
94; AVX512VBMI2-LABEL: var_funnnel_v4i64:
95; AVX512VBMI2:       # %bb.0:
96; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
97; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
98; AVX512VBMI2-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
99; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
100; AVX512VBMI2-NEXT:    retq
101;
102; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
103; AVX512VLVBMI2:       # %bb.0:
104; AVX512VLVBMI2-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
105; AVX512VLVBMI2-NEXT:    retq
106;
107; XOPAVX1-LABEL: var_funnnel_v4i64:
108; XOPAVX1:       # %bb.0:
109; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
110; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
111; XOPAVX1-NEXT:    vprotq %xmm2, %xmm3, %xmm2
112; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
113; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
114; XOPAVX1-NEXT:    retq
115;
116; XOPAVX2-LABEL: var_funnnel_v4i64:
117; XOPAVX2:       # %bb.0:
118; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
119; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
120; XOPAVX2-NEXT:    vprotq %xmm2, %xmm3, %xmm2
121; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
122; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
123; XOPAVX2-NEXT:    retq
  ; Both data operands are %x, so this funnel shift rotates each i64 lane of
  ; %x left by the matching lane of %amt.
124  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %amt)
125  ret <4 x i64> %res
126}
127
; Rotate test for <8 x i32> with a variable (per-element) amount.
; The body calls @llvm.fshl.v8i32 with %x as both data operands and %amt as
; the shift amount; per the LLVM LangRef, fshl with identical first two
; operands is a rotate left. Each prefixed CHECK group below is the expected
; llc output for the RUN line(s) at the top of the file that use that prefix.
; The assertions are autogenerated (see the NOTE at the top of the file).
128define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
129; AVX1-LABEL: var_funnnel_v8i32:
130; AVX1:       # %bb.0:
131; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
132; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
133; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
134; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
135; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
136; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
137; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
138; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
139; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
140; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
141; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
142; AVX1-NEXT:    vpmuludq %xmm2, %xmm6, %xmm2
143; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
144; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
145; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
146; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
147; AVX1-NEXT:    vpor %xmm6, %xmm2, %xmm2
148; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
149; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
150; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
151; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
152; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
153; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
154; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
155; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
156; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
157; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
158; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
159; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
160; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
161; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
162; AVX1-NEXT:    retq
163;
164; AVX2-LABEL: var_funnnel_v8i32:
165; AVX2:       # %bb.0:
166; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
167; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
168; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm2
169; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
170; AVX2-NEXT:    vpsubd %ymm1, %ymm3, %ymm1
171; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
172; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
173; AVX2-NEXT:    retq
174;
175; AVX512F-LABEL: var_funnnel_v8i32:
176; AVX512F:       # %bb.0:
177; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
178; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
179; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
180; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
181; AVX512F-NEXT:    retq
182;
183; AVX512VL-LABEL: var_funnnel_v8i32:
184; AVX512VL:       # %bb.0:
185; AVX512VL-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
186; AVX512VL-NEXT:    retq
187;
188; AVX512BW-LABEL: var_funnnel_v8i32:
189; AVX512BW:       # %bb.0:
190; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
191; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
192; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
193; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
194; AVX512BW-NEXT:    retq
195;
196; AVX512VLBW-LABEL: var_funnnel_v8i32:
197; AVX512VLBW:       # %bb.0:
198; AVX512VLBW-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
199; AVX512VLBW-NEXT:    retq
200;
201; AVX512VBMI2-LABEL: var_funnnel_v8i32:
202; AVX512VBMI2:       # %bb.0:
203; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
204; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
205; AVX512VBMI2-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
206; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
207; AVX512VBMI2-NEXT:    retq
208;
209; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
210; AVX512VLVBMI2:       # %bb.0:
211; AVX512VLVBMI2-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
212; AVX512VLVBMI2-NEXT:    retq
213;
214; XOPAVX1-LABEL: var_funnnel_v8i32:
215; XOPAVX1:       # %bb.0:
216; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
217; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
218; XOPAVX1-NEXT:    vprotd %xmm2, %xmm3, %xmm2
219; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
220; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
221; XOPAVX1-NEXT:    retq
222;
223; XOPAVX2-LABEL: var_funnnel_v8i32:
224; XOPAVX2:       # %bb.0:
225; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
226; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
227; XOPAVX2-NEXT:    vprotd %xmm2, %xmm3, %xmm2
228; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
229; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
230; XOPAVX2-NEXT:    retq
  ; Both data operands are %x, so this funnel shift rotates each i32 lane of
  ; %x left by the matching lane of %amt.
231  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %amt)
232  ret <8 x i32> %res
233}
234
; Rotate test for <16 x i16> with a variable (per-element) amount.
; The body calls @llvm.fshl.v16i16 with %x as both data operands and %amt as
; the shift amount; per the LLVM LangRef, fshl with identical first two
; operands is a rotate left. Each prefixed CHECK group below is the expected
; llc output for the RUN line(s) at the top of the file that use that prefix.
; The assertions are autogenerated (see the NOTE at the top of the file).
235define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
236; AVX1-LABEL: var_funnnel_v16i16:
237; AVX1:       # %bb.0:
238; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
239; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
240; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
241; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
242; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
243; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
244; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
245; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
246; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
247; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
248; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
249; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
250; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
251; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
252; AVX1-NEXT:    vpmulhuw %xmm2, %xmm4, %xmm6
253; AVX1-NEXT:    vpmullw %xmm2, %xmm4, %xmm2
254; AVX1-NEXT:    vpor %xmm6, %xmm2, %xmm2
255; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
256; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
257; AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
258; AVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
259; AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
260; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
261; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
262; AVX1-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
263; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
264; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
265; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm3
266; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
267; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
268; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
269; AVX1-NEXT:    retq
270;
271; AVX2-LABEL: var_funnnel_v16i16:
272; AVX2:       # %bb.0:
273; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
274; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
275; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
276; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
277; AVX2-NEXT:    vpsllvd %ymm4, %ymm3, %ymm4
278; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
279; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
280; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
281; AVX2-NEXT:    vpsllvd %ymm5, %ymm0, %ymm5
282; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
283; AVX2-NEXT:    vpackusdw %ymm4, %ymm5, %ymm4
284; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
285; AVX2-NEXT:    vpsubw %ymm1, %ymm5, %ymm1
286; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
287; AVX2-NEXT:    vpsrlvd %ymm5, %ymm3, %ymm3
288; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
289; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
290; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
291; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
292; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
293; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
294; AVX2-NEXT:    retq
295;
296; AVX512F-LABEL: var_funnnel_v16i16:
297; AVX512F:       # %bb.0:
298; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
299; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
300; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
301; AVX512F-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
302; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
303; AVX512F-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
304; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
305; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
306; AVX512F-NEXT:    vpord %zmm0, %zmm2, %zmm0
307; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
308; AVX512F-NEXT:    retq
309;
310; AVX512VL-LABEL: var_funnnel_v16i16:
311; AVX512VL:       # %bb.0:
312; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
313; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
314; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
315; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
316; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
317; AVX512VL-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
318; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
319; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
320; AVX512VL-NEXT:    vpord %zmm0, %zmm2, %zmm0
321; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
322; AVX512VL-NEXT:    retq
323;
324; AVX512BW-LABEL: var_funnnel_v16i16:
325; AVX512BW:       # %bb.0:
326; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
327; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
328; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
329; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
330; AVX512BW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
331; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
332; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
333; AVX512BW-NEXT:    retq
334;
335; AVX512VLBW-LABEL: var_funnnel_v16i16:
336; AVX512VLBW:       # %bb.0:
337; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
338; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2
339; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
340; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
341; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
342; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
343; AVX512VLBW-NEXT:    retq
344;
345; AVX512VBMI2-LABEL: var_funnnel_v16i16:
346; AVX512VBMI2:       # %bb.0:
347; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
348; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
349; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
350; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
351; AVX512VBMI2-NEXT:    retq
352;
353; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
354; AVX512VLVBMI2:       # %bb.0:
355; AVX512VLVBMI2-NEXT:    vpshldvw %ymm1, %ymm0, %ymm0
356; AVX512VLVBMI2-NEXT:    retq
357;
358; XOPAVX1-LABEL: var_funnnel_v16i16:
359; XOPAVX1:       # %bb.0:
360; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
361; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
362; XOPAVX1-NEXT:    vprotw %xmm2, %xmm3, %xmm2
363; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
364; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
365; XOPAVX1-NEXT:    retq
366;
367; XOPAVX2-LABEL: var_funnnel_v16i16:
368; XOPAVX2:       # %bb.0:
369; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
370; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
371; XOPAVX2-NEXT:    vprotw %xmm2, %xmm3, %xmm2
372; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
373; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
374; XOPAVX2-NEXT:    retq
  ; Both data operands are %x, so this funnel shift rotates each i16 lane of
  ; %x left by the matching lane of %amt.
375  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %amt)
376  ret <16 x i16> %res
377}
378
379define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
380; AVX1-LABEL: var_funnnel_v32i8:
381; AVX1:       # %bb.0:
382; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
383; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
384; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
385; AVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
386; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm5
387; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
388; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
389; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
390; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
391; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
392; AVX1-NEXT:    vpsrlw $6, %xmm2, %xmm3
393; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
394; AVX1-NEXT:    vpandn %xmm3, %xmm6, %xmm3
395; AVX1-NEXT:    vpsllw $2, %xmm2, %xmm7
396; AVX1-NEXT:    vpand %xmm6, %xmm7, %xmm7
397; AVX1-NEXT:    vpor %xmm3, %xmm7, %xmm3
398; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
399; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
400; AVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
401; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
402; AVX1-NEXT:    vpand %xmm3, %xmm8, %xmm3
403; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm7
404; AVX1-NEXT:    vpor %xmm3, %xmm7, %xmm3
405; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
406; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
407; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
408; AVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
409; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm5
410; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
411; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
412; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
413; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
414; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm3
415; AVX1-NEXT:    vpandn %xmm3, %xmm6, %xmm3
416; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm4
417; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
418; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
419; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
420; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
421; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
422; AVX1-NEXT:    vpand %xmm3, %xmm8, %xmm3
423; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
424; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
425; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
426; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
427; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
428; AVX1-NEXT:    retq
429;
430; AVX2-LABEL: var_funnnel_v32i8:
431; AVX2:       # %bb.0:
432; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
433; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
434; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm3
435; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
436; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
437; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
438; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
439; AVX2-NEXT:    vpsrlw $6, %ymm0, %ymm2
440; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
441; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
442; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
443; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
444; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
445; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
446; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
447; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm3
448; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
449; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
450; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
451; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
452; AVX2-NEXT:    retq
453;
454; AVX512F-LABEL: var_funnnel_v32i8:
455; AVX512F:       # %bb.0:
456; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
457; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
458; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
459; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
460; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
461; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
462; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
463; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm2
464; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
465; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm3
466; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
467; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
468; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
469; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
470; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
471; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
472; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
473; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
474; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
475; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
476; AVX512F-NEXT:    retq
477;
478; AVX512VL-LABEL: var_funnnel_v32i8:
479; AVX512VL:       # %bb.0:
480; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
481; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
482; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
483; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
484; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
485; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
486; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
487; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
488; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
489; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
490; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm2
491; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
492; AVX512VL-NEXT:    vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3
493; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
494; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
495; AVX512VL-NEXT:    retq
496;
497; AVX512BW-LABEL: var_funnnel_v32i8:
498; AVX512BW:       # %bb.0:
499; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
500; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm3
501; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
502; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
503; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
504; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
505; AVX512BW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
506; AVX512BW-NEXT:    vpand %ymm2, %ymm1, %ymm1
507; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
508; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
509; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
510; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
511; AVX512BW-NEXT:    retq
512;
513; AVX512VLBW-LABEL: var_funnnel_v32i8:
514; AVX512VLBW:       # %bb.0:
515; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
516; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm3
517; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
518; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
519; AVX512VLBW-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
520; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
521; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
522; AVX512VLBW-NEXT:    vpand %ymm2, %ymm1, %ymm1
523; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
524; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
525; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
526; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
527; AVX512VLBW-NEXT:    retq
528;
529; AVX512VBMI2-LABEL: var_funnnel_v32i8:
530; AVX512VBMI2:       # %bb.0:
531; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
532; AVX512VBMI2-NEXT:    vpand %ymm2, %ymm1, %ymm3
533; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
534; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
535; AVX512VBMI2-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
536; AVX512VBMI2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
537; AVX512VBMI2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
538; AVX512VBMI2-NEXT:    vpand %ymm2, %ymm1, %ymm1
539; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
540; AVX512VBMI2-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
541; AVX512VBMI2-NEXT:    vporq %zmm0, %zmm3, %zmm0
542; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
543; AVX512VBMI2-NEXT:    retq
544;
545; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
546; AVX512VLVBMI2:       # %bb.0:
547; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
548; AVX512VLVBMI2-NEXT:    vpand %ymm2, %ymm1, %ymm3
549; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
550; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
551; AVX512VLVBMI2-NEXT:    vpsllvw %zmm3, %zmm0, %zmm3
552; AVX512VLVBMI2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
553; AVX512VLVBMI2-NEXT:    vpsubb %ymm1, %ymm4, %ymm1
554; AVX512VLVBMI2-NEXT:    vpand %ymm2, %ymm1, %ymm1
555; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
556; AVX512VLVBMI2-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
557; AVX512VLVBMI2-NEXT:    vporq %zmm0, %zmm3, %zmm0
558; AVX512VLVBMI2-NEXT:    vpmovwb %zmm0, %ymm0
559; AVX512VLVBMI2-NEXT:    retq
560;
561; XOPAVX1-LABEL: var_funnnel_v32i8:
562; XOPAVX1:       # %bb.0:
563; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
564; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
565; XOPAVX1-NEXT:    vprotb %xmm2, %xmm3, %xmm2
566; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
567; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
568; XOPAVX1-NEXT:    retq
569;
570; XOPAVX2-LABEL: var_funnnel_v32i8:
571; XOPAVX2:       # %bb.0:
572; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
573; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
574; XOPAVX2-NEXT:    vprotb %xmm2, %xmm3, %xmm2
575; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
576; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
577; XOPAVX2-NEXT:    retq
578  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %amt)
579  ret <32 x i8> %res
580}
581
582;
583; Uniform Variable Shifts
584;
585
; splatvar_funnnel_v4i64 - rotate every i64 lane of %x left by one shared,
; run-time amount. Lane 0 of %amt is broadcast to all four lanes and passed as
; the shift operand of llvm.fshl.v4i64; %x is supplied as both data operands,
; which per the LangRef definition of fshl makes the funnel shift a rotate.
; The per-configuration assembly expectations that follow are autogenerated
; (see the first line of this file) - regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
586define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind {
587; AVX1-LABEL: splatvar_funnnel_v4i64:
588; AVX1:       # %bb.0:
589; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
590; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
591; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
592; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
593; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
594; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
595; AVX1-NEXT:    vpsrlq %xmm2, %xmm4, %xmm5
596; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
597; AVX1-NEXT:    vpsrlq %xmm6, %xmm4, %xmm7
598; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
599; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm2
600; AVX1-NEXT:    vpsrlq %xmm6, %xmm0, %xmm6
601; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
602; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
603; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
604; AVX1-NEXT:    vpsllq %xmm1, %xmm4, %xmm3
605; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
606; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
607; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
608; AVX1-NEXT:    retq
609;
610; AVX2-LABEL: splatvar_funnnel_v4i64:
611; AVX2:       # %bb.0:
612; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [63,63]
613; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3
614; AVX2-NEXT:    vpsllq %xmm3, %ymm0, %ymm3
615; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
616; AVX2-NEXT:    vpsubq %xmm1, %xmm4, %xmm1
617; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
618; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
619; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
620; AVX2-NEXT:    retq
621;
622; AVX512F-LABEL: splatvar_funnnel_v4i64:
623; AVX512F:       # %bb.0:
624; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
625; AVX512F-NEXT:    vpbroadcastq %xmm1, %ymm1
626; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
627; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
628; AVX512F-NEXT:    retq
629;
630; AVX512VL-LABEL: splatvar_funnnel_v4i64:
631; AVX512VL:       # %bb.0:
632; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
633; AVX512VL-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
634; AVX512VL-NEXT:    retq
635;
636; AVX512BW-LABEL: splatvar_funnnel_v4i64:
637; AVX512BW:       # %bb.0:
638; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
639; AVX512BW-NEXT:    vpbroadcastq %xmm1, %ymm1
640; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
641; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
642; AVX512BW-NEXT:    retq
643;
644; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
645; AVX512VLBW:       # %bb.0:
646; AVX512VLBW-NEXT:    vpbroadcastq %xmm1, %ymm1
647; AVX512VLBW-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
648; AVX512VLBW-NEXT:    retq
649;
650; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
651; AVX512VBMI2:       # %bb.0:
652; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
653; AVX512VBMI2-NEXT:    vpbroadcastq %xmm1, %ymm1
654; AVX512VBMI2-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
655; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
656; AVX512VBMI2-NEXT:    retq
657;
658; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
659; AVX512VLVBMI2:       # %bb.0:
660; AVX512VLVBMI2-NEXT:    vpbroadcastq %xmm1, %ymm1
661; AVX512VLVBMI2-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
662; AVX512VLVBMI2-NEXT:    retq
663;
664; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
665; XOPAVX1:       # %bb.0:
666; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
667; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
668; XOPAVX1-NEXT:    vprotq %xmm1, %xmm2, %xmm2
669; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
670; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
671; XOPAVX1-NEXT:    retq
672;
673; XOPAVX2-LABEL: splatvar_funnnel_v4i64:
674; XOPAVX2:       # %bb.0:
675; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
676; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
677; XOPAVX2-NEXT:    vprotq %xmm1, %xmm2, %xmm2
678; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
679; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
680; XOPAVX2-NEXT:    retq
; IR under test - the all-zero shuffle mask selects element 0 of %amt for every
; result lane, so the second shuffle operand (undef) is never read.
681  %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer
682  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> %splat)
683  ret <4 x i64> %res
684}
685
; splatvar_funnnel_v8i32 - rotate every i32 lane of %x left by one shared,
; run-time amount. Lane 0 of %amt is broadcast to all eight lanes and passed as
; the shift operand of llvm.fshl.v8i32; %x is supplied as both data operands,
; which per the LangRef definition of fshl makes the funnel shift a rotate.
; The per-configuration assembly expectations that follow are autogenerated
; (see the first line of this file) - regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
686define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind {
687; AVX1-LABEL: splatvar_funnnel_v8i32:
688; AVX1:       # %bb.0:
689; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
690; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
691; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
692; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
693; AVX1-NEXT:    vpslld %xmm3, %xmm2, %xmm4
694; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
695; AVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
696; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
697; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
698; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
699; AVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
700; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
701; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
702; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
703; AVX1-NEXT:    retq
704;
705; AVX2-LABEL: splatvar_funnnel_v8i32:
706; AVX2:       # %bb.0:
707; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
708; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
709; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
710; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
711; AVX2-NEXT:    vpslld %xmm2, %ymm0, %ymm2
712; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
713; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
714; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
715; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
716; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
717; AVX2-NEXT:    retq
718;
719; AVX512F-LABEL: splatvar_funnnel_v8i32:
720; AVX512F:       # %bb.0:
721; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
722; AVX512F-NEXT:    vpbroadcastd %xmm1, %ymm1
723; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
724; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
725; AVX512F-NEXT:    retq
726;
727; AVX512VL-LABEL: splatvar_funnnel_v8i32:
728; AVX512VL:       # %bb.0:
729; AVX512VL-NEXT:    vpbroadcastd %xmm1, %ymm1
730; AVX512VL-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
731; AVX512VL-NEXT:    retq
732;
733; AVX512BW-LABEL: splatvar_funnnel_v8i32:
734; AVX512BW:       # %bb.0:
735; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
736; AVX512BW-NEXT:    vpbroadcastd %xmm1, %ymm1
737; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
738; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
739; AVX512BW-NEXT:    retq
740;
741; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
742; AVX512VLBW:       # %bb.0:
743; AVX512VLBW-NEXT:    vpbroadcastd %xmm1, %ymm1
744; AVX512VLBW-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
745; AVX512VLBW-NEXT:    retq
746;
747; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
748; AVX512VBMI2:       # %bb.0:
749; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
750; AVX512VBMI2-NEXT:    vpbroadcastd %xmm1, %ymm1
751; AVX512VBMI2-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
752; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
753; AVX512VBMI2-NEXT:    retq
754;
755; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
756; AVX512VLVBMI2:       # %bb.0:
757; AVX512VLVBMI2-NEXT:    vpbroadcastd %xmm1, %ymm1
758; AVX512VLVBMI2-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
759; AVX512VLVBMI2-NEXT:    retq
760;
761; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
762; XOPAVX1:       # %bb.0:
763; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
764; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
765; XOPAVX1-NEXT:    vprotd %xmm1, %xmm2, %xmm2
766; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
767; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
768; XOPAVX1-NEXT:    retq
769;
770; XOPAVX2-LABEL: splatvar_funnnel_v8i32:
771; XOPAVX2:       # %bb.0:
772; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
773; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
774; XOPAVX2-NEXT:    vprotd %xmm1, %xmm2, %xmm2
775; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
776; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
777; XOPAVX2-NEXT:    retq
; IR under test - the all-zero shuffle mask selects element 0 of %amt for every
; result lane, so the second shuffle operand (undef) is never read.
778  %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer
779  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> %splat)
780  ret <8 x i32> %res
781}
782
; splatvar_funnnel_v16i16 - rotate every i16 lane of %x left by one shared,
; run-time amount. Lane 0 of %amt is broadcast to all sixteen lanes and passed
; as the shift operand of llvm.fshl.v16i16; %x is supplied as both data
; operands, which per the LangRef definition of fshl makes the funnel shift a
; rotate.
; The per-configuration assembly expectations that follow are autogenerated
; (see the first line of this file) - regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
783define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind {
784; AVX1-LABEL: splatvar_funnnel_v16i16:
785; AVX1:       # %bb.0:
786; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
787; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
788; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
789; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
790; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
791; AVX1-NEXT:    vpsllw %xmm3, %xmm2, %xmm4
792; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
793; AVX1-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
794; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
795; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
796; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
797; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
798; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
799; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
800; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
801; AVX1-NEXT:    retq
802;
803; AVX2-LABEL: splatvar_funnnel_v16i16:
804; AVX2:       # %bb.0:
805; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
806; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
807; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
808; AVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
809; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
810; AVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
811; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
812; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
813; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
814; AVX2-NEXT:    retq
815;
816; AVX512F-LABEL: splatvar_funnnel_v16i16:
817; AVX512F:       # %bb.0:
818; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
819; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
820; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
821; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
822; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
823; AVX512F-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
824; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
825; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
826; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
827; AVX512F-NEXT:    retq
828;
829; AVX512VL-LABEL: splatvar_funnnel_v16i16:
830; AVX512VL:       # %bb.0:
831; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
832; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
833; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
834; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
835; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
836; AVX512VL-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
837; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
838; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
839; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
840; AVX512VL-NEXT:    retq
841;
842; AVX512BW-LABEL: splatvar_funnnel_v16i16:
843; AVX512BW:       # %bb.0:
844; AVX512BW-NEXT:    vpbroadcastw %xmm1, %xmm1
845; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
846; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
847; AVX512BW-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
848; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
849; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
850; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
851; AVX512BW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
852; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
853; AVX512BW-NEXT:    retq
854;
855; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
856; AVX512VLBW:       # %bb.0:
857; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %xmm1
858; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
859; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
860; AVX512VLBW-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
861; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
862; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
863; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
864; AVX512VLBW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
865; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
866; AVX512VLBW-NEXT:    retq
867;
868; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16:
869; AVX512VBMI2:       # %bb.0:
870; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
871; AVX512VBMI2-NEXT:    vpbroadcastw %xmm1, %ymm1
872; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
873; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
874; AVX512VBMI2-NEXT:    retq
875;
876; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16:
877; AVX512VLVBMI2:       # %bb.0:
878; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm1, %ymm1
879; AVX512VLVBMI2-NEXT:    vpshldvw %ymm1, %ymm0, %ymm0
880; AVX512VLVBMI2-NEXT:    retq
881;
882; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
883; XOPAVX1:       # %bb.0:
884; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
885; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
886; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
887; XOPAVX1-NEXT:    vprotw %xmm1, %xmm2, %xmm2
888; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
889; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
890; XOPAVX1-NEXT:    retq
891;
892; XOPAVX2-LABEL: splatvar_funnnel_v16i16:
893; XOPAVX2:       # %bb.0:
894; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
895; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
896; XOPAVX2-NEXT:    vprotw %xmm1, %xmm2, %xmm2
897; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
898; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
899; XOPAVX2-NEXT:    retq
; IR under test - the all-zero shuffle mask selects element 0 of %amt for every
; result lane, so the second shuffle operand (undef) is never read.
900  %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
901  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> %splat)
902  ret <16 x i16> %res
903}
904
; splatvar_funnnel_v32i8 - rotate every i8 lane of %x left by one shared,
; run-time amount. Lane 0 of %amt is broadcast to all thirty-two lanes and
; passed as the shift operand of llvm.fshl.v32i8; %x is supplied as both data
; operands, which per the LangRef definition of fshl makes the funnel shift a
; rotate.
; The per-configuration assembly expectations that follow are autogenerated
; (see the first line of this file) - regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
905define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
906; AVX1-LABEL: splatvar_funnnel_v32i8:
907; AVX1:       # %bb.0:
908; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
909; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
910; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
911; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
912; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
913; AVX1-NEXT:    vpsllw %xmm3, %xmm4, %xmm5
914; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
915; AVX1-NEXT:    vpsllw %xmm3, %xmm6, %xmm7
916; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
917; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
918; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
919; AVX1-NEXT:    vpsubb %xmm1, %xmm7, %xmm1
920; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
921; AVX1-NEXT:    vpsrlw %xmm1, %xmm4, %xmm4
922; AVX1-NEXT:    vpsrlw %xmm1, %xmm6, %xmm6
923; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
924; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
925; AVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
926; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
927; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
928; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
929; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
930; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
931; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
932; AVX1-NEXT:    retq
933;
934; AVX2-LABEL: splatvar_funnnel_v32i8:
935; AVX2:       # %bb.0:
936; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
937; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
938; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
939; AVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
940; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
941; AVX2-NEXT:    vpsllw %xmm2, %xmm4, %xmm2
942; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
943; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm2
944; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
945; AVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
946; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
947; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
948; AVX2-NEXT:    vpsrlw %xmm1, %xmm4, %xmm1
949; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
950; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
951; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
952; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
953; AVX2-NEXT:    retq
954;
955; AVX512F-LABEL: splatvar_funnnel_v32i8:
956; AVX512F:       # %bb.0:
957; AVX512F-NEXT:    vpbroadcastb %xmm1, %xmm1
958; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
959; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
960; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
961; AVX512F-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
962; AVX512F-NEXT:    vpsllw %xmm2, %xmm4, %xmm2
963; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
964; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm2
965; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
966; AVX512F-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
967; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
968; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
969; AVX512F-NEXT:    vpsrlw %xmm1, %xmm4, %xmm1
970; AVX512F-NEXT:    vpsrlw $8, %xmm1, %xmm1
971; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
972; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
973; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
974; AVX512F-NEXT:    retq
975;
976; AVX512VL-LABEL: splatvar_funnnel_v32i8:
977; AVX512VL:       # %bb.0:
978; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
979; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
980; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
981; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
982; AVX512VL-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
983; AVX512VL-NEXT:    vpsllw %xmm2, %xmm4, %xmm2
984; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
985; AVX512VL-NEXT:    vpand %ymm2, %ymm3, %ymm2
986; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
987; AVX512VL-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
988; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
989; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm3
990; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm4, %xmm0
991; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
992; AVX512VL-NEXT:    vpbroadcastb %xmm0, %ymm0
993; AVX512VL-NEXT:    vpternlogq $236, %ymm3, %ymm2, %ymm0
994; AVX512VL-NEXT:    retq
995;
996; AVX512BW-LABEL: splatvar_funnnel_v32i8:
997; AVX512BW:       # %bb.0:
998; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
999; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1000; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm3
1001; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1002; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1003; AVX512BW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
1004; AVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1005; AVX512BW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
1006; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
1007; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1008; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
1009; AVX512BW-NEXT:    vporq %zmm0, %zmm3, %zmm0
1010; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1011; AVX512BW-NEXT:    retq
1012;
1013; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
1014; AVX512VLBW:       # %bb.0:
1015; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
1016; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1017; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm3
1018; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1019; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1020; AVX512VLBW-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
1021; AVX512VLBW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1022; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
1023; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
1024; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1025; AVX512VLBW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
1026; AVX512VLBW-NEXT:    vporq %zmm0, %zmm3, %zmm0
1027; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
1028; AVX512VLBW-NEXT:    retq
1029;
1030; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
1031; AVX512VBMI2:       # %bb.0:
1032; AVX512VBMI2-NEXT:    vpbroadcastb %xmm1, %xmm1
1033; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1034; AVX512VBMI2-NEXT:    vpand %xmm2, %xmm1, %xmm3
1035; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1036; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1037; AVX512VBMI2-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
1038; AVX512VBMI2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1039; AVX512VBMI2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
1040; AVX512VBMI2-NEXT:    vpand %xmm2, %xmm1, %xmm1
1041; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1042; AVX512VBMI2-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
1043; AVX512VBMI2-NEXT:    vporq %zmm0, %zmm3, %zmm0
1044; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
1045; AVX512VBMI2-NEXT:    retq
1046;
1047; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
1048; AVX512VLVBMI2:       # %bb.0:
1049; AVX512VLVBMI2-NEXT:    vpbroadcastb %xmm1, %xmm1
1050; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
1051; AVX512VLVBMI2-NEXT:    vpand %xmm2, %xmm1, %xmm3
1052; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
1053; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1054; AVX512VLVBMI2-NEXT:    vpsllw %xmm3, %zmm0, %zmm3
1055; AVX512VLVBMI2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1056; AVX512VLVBMI2-NEXT:    vpsubb %xmm1, %xmm4, %xmm1
1057; AVX512VLVBMI2-NEXT:    vpand %xmm2, %xmm1, %xmm1
1058; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1059; AVX512VLVBMI2-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
1060; AVX512VLVBMI2-NEXT:    vporq %zmm0, %zmm3, %zmm0
1061; AVX512VLVBMI2-NEXT:    vpmovwb %zmm0, %ymm0
1062; AVX512VLVBMI2-NEXT:    retq
1063;
1064; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
1065; XOPAVX1:       # %bb.0:
1066; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1067; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1068; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1069; XOPAVX1-NEXT:    vprotb %xmm1, %xmm2, %xmm2
1070; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
1071; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1072; XOPAVX1-NEXT:    retq
1073;
1074; XOPAVX2-LABEL: splatvar_funnnel_v32i8:
1075; XOPAVX2:       # %bb.0:
1076; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1077; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
1078; XOPAVX2-NEXT:    vprotb %xmm1, %xmm2, %xmm2
1079; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
1080; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1081; XOPAVX2-NEXT:    retq
; IR under test - the all-zero shuffle mask selects element 0 of %amt for every
; result lane, so the second shuffle operand (undef) is never read.
1082  %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
1083  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> %splat)
1084  ret <32 x i8> %res
1085}
1086
1087;
1088; Constant Shifts
1089;
1090
; constant_funnnel_v4i64 - rotate each i64 lane of %x left by a distinct
; compile-time constant. The per-lane amounts are <4, 14, 50, 60>, passed as the
; shift operand of llvm.fshl.v4i64; %x is supplied as both data operands, which
; per the LangRef definition of fshl makes the funnel shift a rotate.
; The per-configuration assembly expectations that follow are autogenerated
; (see the first line of this file) - regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
1091define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind {
1092; AVX1-LABEL: constant_funnnel_v4i64:
1093; AVX1:       # %bb.0:
1094; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1095; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm2
1096; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm3
1097; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1098; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm3
1099; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm4
1100; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1101; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1102; AVX1-NEXT:    vpsllq $60, %xmm1, %xmm3
1103; AVX1-NEXT:    vpsllq $50, %xmm1, %xmm1
1104; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1105; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm3
1106; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm0
1107; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
1108; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1109; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
1110; AVX1-NEXT:    retq
1111;
1112; AVX2-LABEL: constant_funnnel_v4i64:
1113; AVX2:       # %bb.0:
1114; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm1
1115; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm0
1116; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
1117; AVX2-NEXT:    retq
1118;
1119; AVX512F-LABEL: constant_funnnel_v4i64:
1120; AVX512F:       # %bb.0:
1121; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1122; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1123; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
1124; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1125; AVX512F-NEXT:    retq
1126;
1127; AVX512VL-LABEL: constant_funnnel_v4i64:
1128; AVX512VL:       # %bb.0:
1129; AVX512VL-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
1130; AVX512VL-NEXT:    retq
1131;
1132; AVX512BW-LABEL: constant_funnnel_v4i64:
1133; AVX512BW:       # %bb.0:
1134; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1135; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1136; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
1137; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1138; AVX512BW-NEXT:    retq
1139;
1140; AVX512VLBW-LABEL: constant_funnnel_v4i64:
1141; AVX512VLBW:       # %bb.0:
1142; AVX512VLBW-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
1143; AVX512VLBW-NEXT:    retq
1144;
1145; AVX512VBMI2-LABEL: constant_funnnel_v4i64:
1146; AVX512VBMI2:       # %bb.0:
1147; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1148; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1149; AVX512VBMI2-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
1150; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1151; AVX512VBMI2-NEXT:    retq
1152;
1153; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64:
1154; AVX512VLVBMI2:       # %bb.0:
1155; AVX512VLVBMI2-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
1156; AVX512VLVBMI2-NEXT:    retq
1157;
1158; XOPAVX1-LABEL: constant_funnnel_v4i64:
1159; XOPAVX1:       # %bb.0:
1160; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
1161; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1162; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
1163; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1164; XOPAVX1-NEXT:    retq
1165;
1166; XOPAVX2-LABEL: constant_funnnel_v4i64:
1167; XOPAVX2:       # %bb.0:
1168; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
1169; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1170; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
1171; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1172; XOPAVX2-NEXT:    retq
; IR under test - a single fshl call whose shift operand is a non-uniform
; constant vector, so each lane gets its own rotate amount.
1173  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 4, i64 14, i64 50, i64 60>)
1174  ret <4 x i64> %res
1175}
1176
; Per-lane rotate-left of <8 x i32>, written as llvm.fshl with %x passed as
; both data operands. The rotate amounts are the non-uniform constants 4..11,
; one per lane. Lowerings recorded in the checks below: multiplies by
; power-of-two tables with +avx, vpsrlvd/vpsllvd/vpor with +avx2, vprolvd on
; the +avx512f runs (widened to zmm when +avx512vl is absent), and a pair of
; 128-bit vprotd with +xop.
1177define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind {
1178; AVX1-LABEL: constant_funnnel_v8i32:
1179; AVX1:       # %bb.0:
1180; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
1181; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1182; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1183; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1184; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
1185; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
1186; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
1187; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
1188; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1189; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1190; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
1191; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
1192; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1193; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1194; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
1195; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
1196; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1197; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1198; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
1199; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
1200; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
1201; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1202; AVX1-NEXT:    retq
1203;
1204; AVX2-LABEL: constant_funnnel_v8i32:
1205; AVX2:       # %bb.0:
1206; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
1207; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
1208; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
1209; AVX2-NEXT:    retq
1210;
1211; AVX512F-LABEL: constant_funnnel_v8i32:
1212; AVX512F:       # %bb.0:
1213; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1214; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1215; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
1216; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1217; AVX512F-NEXT:    retq
1218;
1219; AVX512VL-LABEL: constant_funnnel_v8i32:
1220; AVX512VL:       # %bb.0:
1221; AVX512VL-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
1222; AVX512VL-NEXT:    retq
1223;
1224; AVX512BW-LABEL: constant_funnnel_v8i32:
1225; AVX512BW:       # %bb.0:
1226; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1227; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1228; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
1229; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1230; AVX512BW-NEXT:    retq
1231;
1232; AVX512VLBW-LABEL: constant_funnnel_v8i32:
1233; AVX512VLBW:       # %bb.0:
1234; AVX512VLBW-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
1235; AVX512VLBW-NEXT:    retq
1236;
1237; AVX512VBMI2-LABEL: constant_funnnel_v8i32:
1238; AVX512VBMI2:       # %bb.0:
1239; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1240; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
1241; AVX512VBMI2-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
1242; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1243; AVX512VBMI2-NEXT:    retq
1244;
1245; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32:
1246; AVX512VLVBMI2:       # %bb.0:
1247; AVX512VLVBMI2-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
1248; AVX512VLVBMI2-NEXT:    retq
1249;
1250; XOPAVX1-LABEL: constant_funnnel_v8i32:
1251; XOPAVX1:       # %bb.0:
1252; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
1253; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1254; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
1255; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1256; XOPAVX1-NEXT:    retq
1257;
1258; XOPAVX2-LABEL: constant_funnnel_v8i32:
1259; XOPAVX2:       # %bb.0:
1260; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
1261; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1262; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
1263; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1264; XOPAVX2-NEXT:    retq
1265  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>)
1266  ret <8 x i32> %res
1267}
1268
; Per-lane rotate-left of <16 x i16>, written as llvm.fshl with %x passed as
; both data operands and the non-uniform constant amounts 0..15. Lowerings
; recorded in the checks below: vpmulhuw/vpmullw/vpor against a power-of-two
; table with +avx, +avx2 and +avx512f (with or without +avx512vl);
; vpsrlvw/vpsllvw/vpor once +avx512bw is available; vpshldvw with
; +avx512vbmi2; and a pair of 128-bit vprotw with +xop.
1269define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind {
1270; AVX1-LABEL: constant_funnnel_v16i16:
1271; AVX1:       # %bb.0:
1272; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1273; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
1274; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
1275; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
1276; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
1277; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1278; AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm3
1279; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
1280; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
1281; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1282; AVX1-NEXT:    retq
1283;
1284; AVX2-LABEL: constant_funnnel_v16i16:
1285; AVX2:       # %bb.0:
1286; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1287; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
1288; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1289; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
1290; AVX2-NEXT:    retq
1291;
1292; AVX512F-LABEL: constant_funnnel_v16i16:
1293; AVX512F:       # %bb.0:
1294; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1295; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
1296; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1297; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
1298; AVX512F-NEXT:    retq
1299;
1300; AVX512VL-LABEL: constant_funnnel_v16i16:
1301; AVX512VL:       # %bb.0:
1302; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
1303; AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
1304; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1305; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
1306; AVX512VL-NEXT:    retq
1307;
1308; AVX512BW-LABEL: constant_funnnel_v16i16:
1309; AVX512BW:       # %bb.0:
1310; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1311; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
1312; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
1313; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1314; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
1315; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
1316; AVX512BW-NEXT:    retq
1317;
1318; AVX512VLBW-LABEL: constant_funnnel_v16i16:
1319; AVX512VLBW:       # %bb.0:
1320; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
1321; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
1322; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
1323; AVX512VLBW-NEXT:    retq
1324;
1325; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
1326; AVX512VBMI2:       # %bb.0:
1327; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1328; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1329; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
1330; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1331; AVX512VBMI2-NEXT:    retq
1332;
1333; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
1334; AVX512VLVBMI2:       # %bb.0:
1335; AVX512VLVBMI2-NEXT:    vpshldvw {{.*}}(%rip), %ymm0, %ymm0
1336; AVX512VLVBMI2-NEXT:    retq
1337;
1338; XOPAVX1-LABEL: constant_funnnel_v16i16:
1339; XOPAVX1:       # %bb.0:
1340; XOPAVX1-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm1
1341; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1342; XOPAVX1-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm0
1343; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1344; XOPAVX1-NEXT:    retq
1345;
1346; XOPAVX2-LABEL: constant_funnnel_v16i16:
1347; XOPAVX2:       # %bb.0:
1348; XOPAVX2-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm1
1349; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1350; XOPAVX2-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm0
1351; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1352; XOPAVX2-NEXT:    retq
1353  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
1354  ret <16 x i16> %res
1355}
1356
; Per-lane rotate-left of <32 x i8>, written as llvm.fshl with %x passed as
; both data operands. The constant amounts run 0..7 and then 8 down to 1, and
; that 16-element pattern is repeated for the second half of the vector.
; The amount 8 equals the i8 element width; the +xop expectations show it
; materialized as 0 in the vprotb amount vector. Without +xop there is no
; byte rotate in the expected output: the checks below record sequences built
; from word multiplies/shifts, blends and packs instead.
1357define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind {
1358; AVX1-LABEL: constant_funnnel_v32i8:
1359; AVX1:       # %bb.0:
1360; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1361; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
1362; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1363; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2]
1364; AVX1-NEXT:    vpmullw %xmm3, %xmm9, %xmm3
1365; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
1366; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1367; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
1368; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm7
1369; AVX1-NEXT:    vpsrlw $8, %xmm7, %xmm7
1370; AVX1-NEXT:    vpackuswb %xmm3, %xmm7, %xmm3
1371; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1372; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
1373; AVX1-NEXT:    vpmullw %xmm7, %xmm1, %xmm1
1374; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1375; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
1376; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
1377; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm5
1378; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
1379; AVX1-NEXT:    vpackuswb %xmm1, %xmm5, %xmm1
1380; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
1381; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1382; AVX1-NEXT:    vpmullw %xmm3, %xmm9, %xmm3
1383; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
1384; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1385; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm6
1386; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
1387; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
1388; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1389; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm0
1390; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1391; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
1392; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
1393; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
1394; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
1395; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1396; AVX1-NEXT:    retq
1397;
1398; AVX2-LABEL: constant_funnnel_v32i8:
1399; AVX2:       # %bb.0:
1400; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
1401; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
1402; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1403; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
1404; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1405; AVX2-NEXT:    vpsllw $2, %ymm1, %ymm3
1406; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
1407; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
1408; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1409; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
1410; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
1411; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1412; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1413; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1414; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
1415; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
1416; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1417; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
1418; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1419; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
1420; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
1421; AVX2-NEXT:    retq
1422;
1423; AVX512F-LABEL: constant_funnnel_v32i8:
1424; AVX512F:       # %bb.0:
1425; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
1426; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
1427; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1428; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
1429; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1430; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm3
1431; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
1432; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
1433; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1434; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
1435; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
1436; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1437; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1438; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1439; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
1440; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
1441; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1442; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
1443; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
1444; AVX512F-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
1445; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
1446; AVX512F-NEXT:    retq
1447;
1448; AVX512VL-LABEL: constant_funnnel_v32i8:
1449; AVX512VL:       # %bb.0:
1450; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
1451; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
1452; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
1453; AVX512VL-NEXT:    # ymm2 = mem[0,1,0,1]
1454; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
1455; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm3
1456; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
1457; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
1458; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1459; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
1460; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
1461; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
1462; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1463; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1464; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
1465; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
1466; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1467; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
1468; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
1469; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
1470; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
1471; AVX512VL-NEXT:    retq
1472;
1473; AVX512BW-LABEL: constant_funnnel_v32i8:
1474; AVX512BW:       # %bb.0:
1475; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1476; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
1477; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1478; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
1479; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1480; AVX512BW-NEXT:    retq
1481;
1482; AVX512VLBW-LABEL: constant_funnnel_v32i8:
1483; AVX512VLBW:       # %bb.0:
1484; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1485; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
1486; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1487; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
1488; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
1489; AVX512VLBW-NEXT:    retq
1490;
1491; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
1492; AVX512VBMI2:       # %bb.0:
1493; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1494; AVX512VBMI2-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
1495; AVX512VBMI2-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1496; AVX512VBMI2-NEXT:    vporq %zmm1, %zmm0, %zmm0
1497; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
1498; AVX512VBMI2-NEXT:    retq
1499;
1500; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
1501; AVX512VLVBMI2:       # %bb.0:
1502; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1503; AVX512VLVBMI2-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
1504; AVX512VLVBMI2-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
1505; AVX512VLVBMI2-NEXT:    vporq %zmm1, %zmm0, %zmm0
1506; AVX512VLVBMI2-NEXT:    vpmovwb %zmm0, %ymm0
1507; AVX512VLVBMI2-NEXT:    retq
1508;
1509; XOPAVX1-LABEL: constant_funnnel_v32i8:
1510; XOPAVX1:       # %bb.0:
1511; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1512; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1513; XOPAVX1-NEXT:    vprotb %xmm2, %xmm1, %xmm1
1514; XOPAVX1-NEXT:    vprotb %xmm2, %xmm0, %xmm0
1515; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1516; XOPAVX1-NEXT:    retq
1517;
1518; XOPAVX2-LABEL: constant_funnnel_v32i8:
1519; XOPAVX2:       # %bb.0:
1520; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1521; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1522; XOPAVX2-NEXT:    vprotb %xmm2, %xmm1, %xmm1
1523; XOPAVX2-NEXT:    vprotb %xmm2, %xmm0, %xmm0
1524; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1525; XOPAVX2-NEXT:    retq
1526  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
1527  ret <32 x i8> %res
1528}
1529
1530;
1531; Uniform Constant Shifts
1532;
1533
; Per-lane rotate-left of <4 x i64> by the uniform constant 14, written as
; llvm.fshl with %x passed as both data operands and a splat amount vector.
; Lowerings recorded in the checks below: vpsrlq $50 and vpsllq $14 joined by
; an OR with +avx/+avx2, a single immediate vprolq $14 on the +avx512f runs
; (widened to zmm when +avx512vl is absent), and a pair of 128-bit
; vprotq $14 with +xop.
1534define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind {
1535; AVX1-LABEL: splatconstant_funnnel_v4i64:
1536; AVX1:       # %bb.0:
1537; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm1
1538; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1539; AVX1-NEXT:    vpsrlq $50, %xmm2, %xmm3
1540; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
1541; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm0
1542; AVX1-NEXT:    vpsllq $14, %xmm2, %xmm2
1543; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1544; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
1545; AVX1-NEXT:    retq
1546;
1547; AVX2-LABEL: splatconstant_funnnel_v4i64:
1548; AVX2:       # %bb.0:
1549; AVX2-NEXT:    vpsrlq $50, %ymm0, %ymm1
1550; AVX2-NEXT:    vpsllq $14, %ymm0, %ymm0
1551; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
1552; AVX2-NEXT:    retq
1553;
1554; AVX512F-LABEL: splatconstant_funnnel_v4i64:
1555; AVX512F:       # %bb.0:
1556; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1557; AVX512F-NEXT:    vprolq $14, %zmm0, %zmm0
1558; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1559; AVX512F-NEXT:    retq
1560;
1561; AVX512VL-LABEL: splatconstant_funnnel_v4i64:
1562; AVX512VL:       # %bb.0:
1563; AVX512VL-NEXT:    vprolq $14, %ymm0, %ymm0
1564; AVX512VL-NEXT:    retq
1565;
1566; AVX512BW-LABEL: splatconstant_funnnel_v4i64:
1567; AVX512BW:       # %bb.0:
1568; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1569; AVX512BW-NEXT:    vprolq $14, %zmm0, %zmm0
1570; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1571; AVX512BW-NEXT:    retq
1572;
1573; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64:
1574; AVX512VLBW:       # %bb.0:
1575; AVX512VLBW-NEXT:    vprolq $14, %ymm0, %ymm0
1576; AVX512VLBW-NEXT:    retq
1577;
1578; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64:
1579; AVX512VBMI2:       # %bb.0:
1580; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1581; AVX512VBMI2-NEXT:    vprolq $14, %zmm0, %zmm0
1582; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1583; AVX512VBMI2-NEXT:    retq
1584;
1585; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64:
1586; AVX512VLVBMI2:       # %bb.0:
1587; AVX512VLVBMI2-NEXT:    vprolq $14, %ymm0, %ymm0
1588; AVX512VLVBMI2-NEXT:    retq
1589;
1590; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
1591; XOPAVX1:       # %bb.0:
1592; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm1
1593; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1594; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm0
1595; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1596; XOPAVX1-NEXT:    retq
1597;
1598; XOPAVX2-LABEL: splatconstant_funnnel_v4i64:
1599; XOPAVX2:       # %bb.0:
1600; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm1
1601; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1602; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm0
1603; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1604; XOPAVX2-NEXT:    retq
1605  %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %x, <4 x i64> <i64 14, i64 14, i64 14, i64 14>)
1606  ret <4 x i64> %res
1607}
1608
; Per-lane rotate-left of <8 x i32> by the uniform constant 4, written as
; llvm.fshl with %x passed as both data operands and a splat amount vector.
; Lowerings recorded in the checks below: vpsrld $28 and vpslld $4 joined by
; vpor with +avx/+avx2, a single immediate vprold $4 on the +avx512f runs
; (widened to zmm when +avx512vl is absent), and a pair of 128-bit
; vprotd $4 with +xop.
1609define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind {
1610; AVX1-LABEL: splatconstant_funnnel_v8i32:
1611; AVX1:       # %bb.0:
1612; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1613; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
1614; AVX1-NEXT:    vpslld $4, %xmm1, %xmm1
1615; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
1616; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm2
1617; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
1618; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
1619; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1620; AVX1-NEXT:    retq
1621;
1622; AVX2-LABEL: splatconstant_funnnel_v8i32:
1623; AVX2:       # %bb.0:
1624; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm1
1625; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
1626; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
1627; AVX2-NEXT:    retq
1628;
1629; AVX512F-LABEL: splatconstant_funnnel_v8i32:
1630; AVX512F:       # %bb.0:
1631; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1632; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
1633; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1634; AVX512F-NEXT:    retq
1635;
1636; AVX512VL-LABEL: splatconstant_funnnel_v8i32:
1637; AVX512VL:       # %bb.0:
1638; AVX512VL-NEXT:    vprold $4, %ymm0, %ymm0
1639; AVX512VL-NEXT:    retq
1640;
1641; AVX512BW-LABEL: splatconstant_funnnel_v8i32:
1642; AVX512BW:       # %bb.0:
1643; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1644; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
1645; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1646; AVX512BW-NEXT:    retq
1647;
1648; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32:
1649; AVX512VLBW:       # %bb.0:
1650; AVX512VLBW-NEXT:    vprold $4, %ymm0, %ymm0
1651; AVX512VLBW-NEXT:    retq
1652;
1653; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32:
1654; AVX512VBMI2:       # %bb.0:
1655; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1656; AVX512VBMI2-NEXT:    vprold $4, %zmm0, %zmm0
1657; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1658; AVX512VBMI2-NEXT:    retq
1659;
1660; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32:
1661; AVX512VLVBMI2:       # %bb.0:
1662; AVX512VLVBMI2-NEXT:    vprold $4, %ymm0, %ymm0
1663; AVX512VLVBMI2-NEXT:    retq
1664;
1665; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
1666; XOPAVX1:       # %bb.0:
1667; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
1668; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1669; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
1670; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1671; XOPAVX1-NEXT:    retq
1672;
1673; XOPAVX2-LABEL: splatconstant_funnnel_v8i32:
1674; XOPAVX2:       # %bb.0:
1675; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
1676; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1677; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
1678; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1679; XOPAVX2-NEXT:    retq
1680  %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %x, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>)
1681  ret <8 x i32> %res
1682}
1683
; Per-lane rotate-left of <16 x i16> by the uniform constant 7, written as
; llvm.fshl with %x passed as both data operands and a splat amount vector.
; Lowerings recorded in the checks below: vpsrlw $9 and vpsllw $7 joined by
; vpor for every run without +avx512vbmi2 or +xop; an immediate vpshldw $7
; with +avx512vbmi2 (widened to zmm when +avx512vl is absent); and a pair of
; 128-bit vprotw $7 with +xop.
1684define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x) nounwind {
1685; AVX1-LABEL: splatconstant_funnnel_v16i16:
1686; AVX1:       # %bb.0:
1687; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1688; AVX1-NEXT:    vpsrlw $9, %xmm1, %xmm2
1689; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
1690; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
1691; AVX1-NEXT:    vpsrlw $9, %xmm0, %xmm2
1692; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
1693; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
1694; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1695; AVX1-NEXT:    retq
1696;
1697; AVX2-LABEL: splatconstant_funnnel_v16i16:
1698; AVX2:       # %bb.0:
1699; AVX2-NEXT:    vpsrlw $9, %ymm0, %ymm1
1700; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
1701; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
1702; AVX2-NEXT:    retq
1703;
1704; AVX512F-LABEL: splatconstant_funnnel_v16i16:
1705; AVX512F:       # %bb.0:
1706; AVX512F-NEXT:    vpsrlw $9, %ymm0, %ymm1
1707; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
1708; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
1709; AVX512F-NEXT:    retq
1710;
1711; AVX512VL-LABEL: splatconstant_funnnel_v16i16:
1712; AVX512VL:       # %bb.0:
1713; AVX512VL-NEXT:    vpsrlw $9, %ymm0, %ymm1
1714; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0
1715; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
1716; AVX512VL-NEXT:    retq
1717;
1718; AVX512BW-LABEL: splatconstant_funnnel_v16i16:
1719; AVX512BW:       # %bb.0:
1720; AVX512BW-NEXT:    vpsrlw $9, %ymm0, %ymm1
1721; AVX512BW-NEXT:    vpsllw $7, %ymm0, %ymm0
1722; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
1723; AVX512BW-NEXT:    retq
1724;
1725; AVX512VLBW-LABEL: splatconstant_funnnel_v16i16:
1726; AVX512VLBW:       # %bb.0:
1727; AVX512VLBW-NEXT:    vpsrlw $9, %ymm0, %ymm1
1728; AVX512VLBW-NEXT:    vpsllw $7, %ymm0, %ymm0
1729; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
1730; AVX512VLBW-NEXT:    retq
1731;
1732; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i16:
1733; AVX512VBMI2:       # %bb.0:
1734; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1735; AVX512VBMI2-NEXT:    vpshldw $7, %zmm0, %zmm0, %zmm0
1736; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1737; AVX512VBMI2-NEXT:    retq
1738;
1739; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i16:
1740; AVX512VLVBMI2:       # %bb.0:
1741; AVX512VLVBMI2-NEXT:    vpshldw $7, %ymm0, %ymm0, %ymm0
1742; AVX512VLVBMI2-NEXT:    retq
1743;
1744; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
1745; XOPAVX1:       # %bb.0:
1746; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm1
1747; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1748; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm0
1749; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1750; XOPAVX1-NEXT:    retq
1751;
1752; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
1753; XOPAVX2:       # %bb.0:
1754; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm1
1755; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1756; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm0
1757; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1758; XOPAVX2-NEXT:    retq
1759  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %x, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
1760  ret <16 x i16> %res
1761}
1762
; Test: 256-bit byte-vector funnel shift left by a splat constant amount of 4.
; The body calls @llvm.fshl.v32i8 with %x supplied as both data operands, which
; makes the operation a per-element rotate left of each i8 lane by 4 bits.
; The per-configuration assertion lines below are autogenerated (see the NOTE at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; rather than editing them by hand.
1763define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
1764; AVX1-LABEL: splatconstant_funnnel_v32i8:
1765; AVX1:       # %bb.0:
1766; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1767; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
1768; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
1769; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
1770; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
1771; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1772; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
1773; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
1774; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
1775; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
1776; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1777; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
1778; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1779; AVX1-NEXT:    retq
1780;
1781; AVX2-LABEL: splatconstant_funnnel_v32i8:
1782; AVX2:       # %bb.0:
1783; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
1784; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
1785; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
1786; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
1787; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
1788; AVX2-NEXT:    retq
1789;
1790; AVX512F-LABEL: splatconstant_funnnel_v32i8:
1791; AVX512F:       # %bb.0:
1792; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm1
1793; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
1794; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
1795; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
1796; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
1797; AVX512F-NEXT:    retq
1798;
1799; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
1800; AVX512VL:       # %bb.0:
1801; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
1802; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
1803; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
1804; AVX512VL-NEXT:    retq
1805;
1806; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
1807; AVX512BW:       # %bb.0:
1808; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm1
1809; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
1810; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm0
1811; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
1812; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
1813; AVX512BW-NEXT:    retq
1814;
1815; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
1816; AVX512VLBW:       # %bb.0:
1817; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm1
1818; AVX512VLBW-NEXT:    vpsrlw $4, %ymm0, %ymm0
1819; AVX512VLBW-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
1820; AVX512VLBW-NEXT:    retq
1821;
1822; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
1823; AVX512VBMI2:       # %bb.0:
1824; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm1
1825; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
1826; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm0
1827; AVX512VBMI2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
1828; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm0, %ymm0
1829; AVX512VBMI2-NEXT:    retq
1830;
1831; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
1832; AVX512VLVBMI2:       # %bb.0:
1833; AVX512VLVBMI2-NEXT:    vpsllw $4, %ymm0, %ymm1
1834; AVX512VLVBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1835; AVX512VLVBMI2-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
1836; AVX512VLVBMI2-NEXT:    retq
1837;
1838; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
1839; XOPAVX1:       # %bb.0:
1840; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
1841; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1842; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
1843; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1844; XOPAVX1-NEXT:    retq
1845;
1846; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
1847; XOPAVX2:       # %bb.0:
1848; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
1849; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1850; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
1851; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1852; XOPAVX2-NEXT:    retq
  ; Operands: (%x, %x, splat of i8 4) - the same vector is used as both halves
  ; of the funnel, and all 32 lanes share the same shift amount.
1853  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %x, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
1854  ret <32 x i8> %res
1855}
1856