• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
12
13; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
15
16;
17; Variable Rotates
18;
19
20define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
21; SSE2-LABEL: var_rotate_v2i64:
22; SSE2:       # %bb.0:
23; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
24; SSE2-NEXT:    psubq %xmm1, %xmm2
25; SSE2-NEXT:    movdqa %xmm0, %xmm3
26; SSE2-NEXT:    psllq %xmm1, %xmm3
27; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
28; SSE2-NEXT:    movdqa %xmm0, %xmm4
29; SSE2-NEXT:    psllq %xmm1, %xmm4
30; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
31; SSE2-NEXT:    movdqa %xmm0, %xmm1
32; SSE2-NEXT:    psrlq %xmm2, %xmm1
33; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
34; SSE2-NEXT:    psrlq %xmm2, %xmm0
35; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
36; SSE2-NEXT:    orpd %xmm4, %xmm0
37; SSE2-NEXT:    retq
38;
39; SSE41-LABEL: var_rotate_v2i64:
40; SSE41:       # %bb.0:
41; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
42; SSE41-NEXT:    psubq %xmm1, %xmm2
43; SSE41-NEXT:    movdqa %xmm0, %xmm3
44; SSE41-NEXT:    psllq %xmm1, %xmm3
45; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
46; SSE41-NEXT:    movdqa %xmm0, %xmm4
47; SSE41-NEXT:    psllq %xmm1, %xmm4
48; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
49; SSE41-NEXT:    movdqa %xmm0, %xmm1
50; SSE41-NEXT:    psrlq %xmm2, %xmm1
51; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
52; SSE41-NEXT:    psrlq %xmm2, %xmm0
53; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
54; SSE41-NEXT:    por %xmm4, %xmm0
55; SSE41-NEXT:    retq
56;
57; AVX1-LABEL: var_rotate_v2i64:
58; AVX1:       # %bb.0:
59; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
60; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
61; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm3
62; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
63; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
64; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
65; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
66; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
67; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
68; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
69; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
70; AVX1-NEXT:    retq
71;
72; AVX2-LABEL: var_rotate_v2i64:
73; AVX2:       # %bb.0:
74; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
75; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
76; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm1
77; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
78; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
79; AVX2-NEXT:    retq
80;
81; AVX512F-LABEL: var_rotate_v2i64:
82; AVX512F:       # %bb.0:
83; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
84; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
85; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
86; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
87; AVX512F-NEXT:    vzeroupper
88; AVX512F-NEXT:    retq
89;
90; AVX512VL-LABEL: var_rotate_v2i64:
91; AVX512VL:       # %bb.0:
92; AVX512VL-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
93; AVX512VL-NEXT:    retq
94;
95; AVX512BW-LABEL: var_rotate_v2i64:
96; AVX512BW:       # %bb.0:
97; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
98; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
99; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
100; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
101; AVX512BW-NEXT:    vzeroupper
102; AVX512BW-NEXT:    retq
103;
104; AVX512VLBW-LABEL: var_rotate_v2i64:
105; AVX512VLBW:       # %bb.0:
106; AVX512VLBW-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
107; AVX512VLBW-NEXT:    retq
108;
109; XOP-LABEL: var_rotate_v2i64:
110; XOP:       # %bb.0:
111; XOP-NEXT:    vprotq %xmm1, %xmm0, %xmm0
112; XOP-NEXT:    retq
113;
114; X32-SSE-LABEL: var_rotate_v2i64:
115; X32-SSE:       # %bb.0:
116; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64,0,64,0]
117; X32-SSE-NEXT:    psubq %xmm1, %xmm2
118; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
119; X32-SSE-NEXT:    psllq %xmm1, %xmm3
120; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
121; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
122; X32-SSE-NEXT:    psllq %xmm1, %xmm4
123; X32-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
124; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
125; X32-SSE-NEXT:    psrlq %xmm2, %xmm1
126; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
127; X32-SSE-NEXT:    psrlq %xmm2, %xmm0
128; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
129; X32-SSE-NEXT:    orpd %xmm4, %xmm0
130; X32-SSE-NEXT:    retl
131  %b64 = sub <2 x i64> <i64 64, i64 64>, %b
132  %shl = shl <2 x i64> %a, %b
133  %lshr = lshr <2 x i64> %a, %b64
134  %or = or <2 x i64> %shl, %lshr
135  ret <2 x i64> %or
136}
137
138define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
139; SSE2-LABEL: var_rotate_v4i32:
140; SSE2:       # %bb.0:
141; SSE2-NEXT:    pslld $23, %xmm1
142; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
143; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
144; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
145; SSE2-NEXT:    pmuludq %xmm1, %xmm0
146; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
147; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
148; SSE2-NEXT:    pmuludq %xmm2, %xmm1
149; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
150; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
151; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
152; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
153; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
154; SSE2-NEXT:    por %xmm3, %xmm0
155; SSE2-NEXT:    retq
156;
157; SSE41-LABEL: var_rotate_v4i32:
158; SSE41:       # %bb.0:
159; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
160; SSE41-NEXT:    pslld $23, %xmm1
161; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
162; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
163; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
164; SSE41-NEXT:    pmuludq %xmm2, %xmm3
165; SSE41-NEXT:    pmuludq %xmm1, %xmm0
166; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
167; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
168; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
169; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
170; SSE41-NEXT:    por %xmm1, %xmm0
171; SSE41-NEXT:    retq
172;
173; AVX1-LABEL: var_rotate_v4i32:
174; AVX1:       # %bb.0:
175; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
176; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
177; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
178; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
179; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
180; AVX1-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
181; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
182; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
183; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
184; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
185; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
186; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
187; AVX1-NEXT:    retq
188;
189; AVX2-LABEL: var_rotate_v4i32:
190; AVX2:       # %bb.0:
191; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
192; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
193; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
194; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
195; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
196; AVX2-NEXT:    retq
197;
198; AVX512F-LABEL: var_rotate_v4i32:
199; AVX512F:       # %bb.0:
200; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
201; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
202; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
203; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
204; AVX512F-NEXT:    vzeroupper
205; AVX512F-NEXT:    retq
206;
207; AVX512VL-LABEL: var_rotate_v4i32:
208; AVX512VL:       # %bb.0:
209; AVX512VL-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
210; AVX512VL-NEXT:    retq
211;
212; AVX512BW-LABEL: var_rotate_v4i32:
213; AVX512BW:       # %bb.0:
214; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
215; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
216; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
217; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
218; AVX512BW-NEXT:    vzeroupper
219; AVX512BW-NEXT:    retq
220;
221; AVX512VLBW-LABEL: var_rotate_v4i32:
222; AVX512VLBW:       # %bb.0:
223; AVX512VLBW-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
224; AVX512VLBW-NEXT:    retq
225;
226; XOP-LABEL: var_rotate_v4i32:
227; XOP:       # %bb.0:
228; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
229; XOP-NEXT:    retq
230;
231; X32-SSE-LABEL: var_rotate_v4i32:
232; X32-SSE:       # %bb.0:
233; X32-SSE-NEXT:    pslld $23, %xmm1
234; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
235; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
236; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
237; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
238; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
239; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
240; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
241; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
242; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
243; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
244; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
245; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
246; X32-SSE-NEXT:    por %xmm3, %xmm0
247; X32-SSE-NEXT:    retl
248  %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b
249  %shl = shl <4 x i32> %a, %b
250  %lshr = lshr <4 x i32> %a, %b32
251  %or = or <4 x i32> %shl, %lshr
252  ret <4 x i32> %or
253}
254
255define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
256; SSE2-LABEL: var_rotate_v8i16:
257; SSE2:       # %bb.0:
258; SSE2-NEXT:    pxor %xmm2, %xmm2
259; SSE2-NEXT:    movdqa %xmm1, %xmm3
260; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
261; SSE2-NEXT:    pslld $23, %xmm3
262; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
263; SSE2-NEXT:    paddd %xmm4, %xmm3
264; SSE2-NEXT:    cvttps2dq %xmm3, %xmm3
265; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
266; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
267; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
268; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
269; SSE2-NEXT:    pslld $23, %xmm1
270; SSE2-NEXT:    paddd %xmm4, %xmm1
271; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
272; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
273; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
274; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
275; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
276; SSE2-NEXT:    movdqa %xmm0, %xmm2
277; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
278; SSE2-NEXT:    pmullw %xmm1, %xmm0
279; SSE2-NEXT:    por %xmm2, %xmm0
280; SSE2-NEXT:    retq
281;
282; SSE41-LABEL: var_rotate_v8i16:
283; SSE41:       # %bb.0:
284; SSE41-NEXT:    pxor %xmm2, %xmm2
285; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
286; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
287; SSE41-NEXT:    pslld $23, %xmm1
288; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
289; SSE41-NEXT:    paddd %xmm2, %xmm1
290; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
291; SSE41-NEXT:    pslld $23, %xmm3
292; SSE41-NEXT:    paddd %xmm2, %xmm3
293; SSE41-NEXT:    cvttps2dq %xmm3, %xmm2
294; SSE41-NEXT:    packusdw %xmm1, %xmm2
295; SSE41-NEXT:    movdqa %xmm0, %xmm1
296; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
297; SSE41-NEXT:    pmullw %xmm2, %xmm0
298; SSE41-NEXT:    por %xmm1, %xmm0
299; SSE41-NEXT:    retq
300;
301; AVX1-LABEL: var_rotate_v8i16:
302; AVX1:       # %bb.0:
303; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
304; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
305; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
306; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
307; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
308; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
309; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
310; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
311; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
312; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
313; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
314; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
315; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
316; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
317; AVX1-NEXT:    retq
318;
319; AVX2-LABEL: var_rotate_v8i16:
320; AVX2:       # %bb.0:
321; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
322; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
323; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
324; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
325; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
326; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
327; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
328; AVX2-NEXT:    vpsubw %xmm1, %xmm4, %xmm1
329; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
330; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
331; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
332; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
333; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
334; AVX2-NEXT:    vzeroupper
335; AVX2-NEXT:    retq
336;
337; AVX512F-LABEL: var_rotate_v8i16:
338; AVX512F:       # %bb.0:
339; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
340; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
341; AVX512F-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
342; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
343; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
344; AVX512F-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
345; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
346; AVX512F-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
347; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
348; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
349; AVX512F-NEXT:    vzeroupper
350; AVX512F-NEXT:    retq
351;
352; AVX512VL-LABEL: var_rotate_v8i16:
353; AVX512VL:       # %bb.0:
354; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
355; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
356; AVX512VL-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
357; AVX512VL-NEXT:    vpmovdw %ymm2, %xmm2
358; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
359; AVX512VL-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
360; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
361; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
362; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
363; AVX512VL-NEXT:    vpor %xmm0, %xmm2, %xmm0
364; AVX512VL-NEXT:    vzeroupper
365; AVX512VL-NEXT:    retq
366;
367; AVX512BW-LABEL: var_rotate_v8i16:
368; AVX512BW:       # %bb.0:
369; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
370; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
371; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
372; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
373; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
374; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
375; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
376; AVX512BW-NEXT:    vzeroupper
377; AVX512BW-NEXT:    retq
378;
379; AVX512VLBW-LABEL: var_rotate_v8i16:
380; AVX512VLBW:       # %bb.0:
381; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2
382; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
383; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
384; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
385; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
386; AVX512VLBW-NEXT:    retq
387;
388; XOP-LABEL: var_rotate_v8i16:
389; XOP:       # %bb.0:
390; XOP-NEXT:    vprotw %xmm1, %xmm0, %xmm0
391; XOP-NEXT:    retq
392;
393; X32-SSE-LABEL: var_rotate_v8i16:
394; X32-SSE:       # %bb.0:
395; X32-SSE-NEXT:    pxor %xmm2, %xmm2
396; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
397; X32-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
398; X32-SSE-NEXT:    pslld $23, %xmm3
399; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
400; X32-SSE-NEXT:    paddd %xmm4, %xmm3
401; X32-SSE-NEXT:    cvttps2dq %xmm3, %xmm3
402; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
403; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
404; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
405; X32-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
406; X32-SSE-NEXT:    pslld $23, %xmm1
407; X32-SSE-NEXT:    paddd %xmm4, %xmm1
408; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
409; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
410; X32-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
411; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
412; X32-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
413; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
414; X32-SSE-NEXT:    pmulhuw %xmm1, %xmm2
415; X32-SSE-NEXT:    pmullw %xmm1, %xmm0
416; X32-SSE-NEXT:    por %xmm2, %xmm0
417; X32-SSE-NEXT:    retl
418  %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
419  %shl = shl <8 x i16> %a, %b
420  %lshr = lshr <8 x i16> %a, %b16
421  %or = or <8 x i16> %shl, %lshr
422  ret <8 x i16> %or
423}
424
425define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
426; SSE2-LABEL: var_rotate_v16i8:
427; SSE2:       # %bb.0:
428; SSE2-NEXT:    movdqa %xmm0, %xmm2
429; SSE2-NEXT:    psllw $5, %xmm1
430; SSE2-NEXT:    pxor %xmm0, %xmm0
431; SSE2-NEXT:    pxor %xmm3, %xmm3
432; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
433; SSE2-NEXT:    movdqa %xmm2, %xmm4
434; SSE2-NEXT:    psrlw $4, %xmm4
435; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
436; SSE2-NEXT:    movdqa %xmm2, %xmm5
437; SSE2-NEXT:    psllw $4, %xmm5
438; SSE2-NEXT:    pand {{.*}}(%rip), %xmm5
439; SSE2-NEXT:    por %xmm4, %xmm5
440; SSE2-NEXT:    pand %xmm3, %xmm5
441; SSE2-NEXT:    pandn %xmm2, %xmm3
442; SSE2-NEXT:    por %xmm5, %xmm3
443; SSE2-NEXT:    movdqa %xmm3, %xmm2
444; SSE2-NEXT:    psrlw $6, %xmm2
445; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
446; SSE2-NEXT:    movdqa %xmm3, %xmm4
447; SSE2-NEXT:    psllw $2, %xmm4
448; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
449; SSE2-NEXT:    por %xmm2, %xmm4
450; SSE2-NEXT:    paddb %xmm1, %xmm1
451; SSE2-NEXT:    pxor %xmm2, %xmm2
452; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
453; SSE2-NEXT:    pand %xmm2, %xmm4
454; SSE2-NEXT:    pandn %xmm3, %xmm2
455; SSE2-NEXT:    por %xmm4, %xmm2
456; SSE2-NEXT:    movdqa %xmm2, %xmm3
457; SSE2-NEXT:    paddb %xmm2, %xmm3
458; SSE2-NEXT:    movdqa %xmm2, %xmm4
459; SSE2-NEXT:    psrlw $7, %xmm4
460; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
461; SSE2-NEXT:    por %xmm3, %xmm4
462; SSE2-NEXT:    paddb %xmm1, %xmm1
463; SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
464; SSE2-NEXT:    pand %xmm0, %xmm4
465; SSE2-NEXT:    pandn %xmm2, %xmm0
466; SSE2-NEXT:    por %xmm4, %xmm0
467; SSE2-NEXT:    retq
468;
469; SSE41-LABEL: var_rotate_v16i8:
470; SSE41:       # %bb.0:
471; SSE41-NEXT:    movdqa %xmm1, %xmm2
472; SSE41-NEXT:    movdqa %xmm0, %xmm1
473; SSE41-NEXT:    psrlw $4, %xmm0
474; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
475; SSE41-NEXT:    movdqa %xmm1, %xmm3
476; SSE41-NEXT:    psllw $4, %xmm3
477; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
478; SSE41-NEXT:    por %xmm0, %xmm3
479; SSE41-NEXT:    psllw $5, %xmm2
480; SSE41-NEXT:    movdqa %xmm2, %xmm0
481; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
482; SSE41-NEXT:    movdqa %xmm1, %xmm0
483; SSE41-NEXT:    psrlw $6, %xmm0
484; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
485; SSE41-NEXT:    movdqa %xmm1, %xmm3
486; SSE41-NEXT:    psllw $2, %xmm3
487; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
488; SSE41-NEXT:    por %xmm0, %xmm3
489; SSE41-NEXT:    paddb %xmm2, %xmm2
490; SSE41-NEXT:    movdqa %xmm2, %xmm0
491; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
492; SSE41-NEXT:    movdqa %xmm1, %xmm0
493; SSE41-NEXT:    paddb %xmm1, %xmm0
494; SSE41-NEXT:    movdqa %xmm1, %xmm3
495; SSE41-NEXT:    psrlw $7, %xmm3
496; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
497; SSE41-NEXT:    por %xmm0, %xmm3
498; SSE41-NEXT:    paddb %xmm2, %xmm2
499; SSE41-NEXT:    movdqa %xmm2, %xmm0
500; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
501; SSE41-NEXT:    movdqa %xmm1, %xmm0
502; SSE41-NEXT:    retq
503;
504; AVX-LABEL: var_rotate_v16i8:
505; AVX:       # %bb.0:
506; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
507; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
508; AVX-NEXT:    vpsllw $4, %xmm0, %xmm3
509; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
510; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
511; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
512; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
513; AVX-NEXT:    vpsrlw $6, %xmm0, %xmm2
514; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
515; AVX-NEXT:    vpsllw $2, %xmm0, %xmm3
516; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
517; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
518; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
519; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
520; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
521; AVX-NEXT:    vpsrlw $7, %xmm0, %xmm3
522; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
523; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
524; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
525; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
526; AVX-NEXT:    retq
527;
528; AVX512F-LABEL: var_rotate_v16i8:
529; AVX512F:       # %bb.0:
530; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm2
531; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
532; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm3
533; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
534; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
535; AVX512F-NEXT:    vpsllw $5, %xmm1, %xmm1
536; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
537; AVX512F-NEXT:    vpsrlw $6, %xmm0, %xmm2
538; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
539; AVX512F-NEXT:    vpsllw $2, %xmm0, %xmm3
540; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
541; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
542; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
543; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
544; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
545; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm3
546; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
547; AVX512F-NEXT:    vpor %xmm3, %xmm2, %xmm2
548; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
549; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
550; AVX512F-NEXT:    retq
551;
552; AVX512VL-LABEL: var_rotate_v16i8:
553; AVX512VL:       # %bb.0:
554; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm2
555; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
556; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm3
557; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
558; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
559; AVX512VL-NEXT:    vpsllw $5, %xmm1, %xmm1
560; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
561; AVX512VL-NEXT:    vpsrlw $6, %xmm0, %xmm2
562; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
563; AVX512VL-NEXT:    vpsllw $2, %xmm0, %xmm3
564; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
565; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
566; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
567; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
568; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
569; AVX512VL-NEXT:    vpsrlw $7, %xmm0, %xmm3
570; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
571; AVX512VL-NEXT:    vpor %xmm3, %xmm2, %xmm2
572; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
573; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
574; AVX512VL-NEXT:    retq
575;
576; AVX512BW-LABEL: var_rotate_v16i8:
577; AVX512BW:       # %bb.0:
578; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
579; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
580; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
581; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
582; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
583; AVX512BW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
584; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
585; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
586; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
587; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
588; AVX512BW-NEXT:    vzeroupper
589; AVX512BW-NEXT:    retq
590;
591; AVX512VLBW-LABEL: var_rotate_v16i8:
592; AVX512VLBW:       # %bb.0:
593; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
594; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
595; AVX512VLBW-NEXT:    vpsllvw %ymm2, %ymm0, %ymm2
596; AVX512VLBW-NEXT:    vpmovwb %ymm2, %xmm2
597; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
598; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
599; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
600; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
601; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
602; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
603; AVX512VLBW-NEXT:    vzeroupper
604; AVX512VLBW-NEXT:    retq
605;
606; XOP-LABEL: var_rotate_v16i8:
607; XOP:       # %bb.0:
608; XOP-NEXT:    vprotb %xmm1, %xmm0, %xmm0
609; XOP-NEXT:    retq
610;
611; X32-SSE-LABEL: var_rotate_v16i8:
612; X32-SSE:       # %bb.0:
613; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
614; X32-SSE-NEXT:    psllw $5, %xmm1
615; X32-SSE-NEXT:    pxor %xmm0, %xmm0
616; X32-SSE-NEXT:    pxor %xmm3, %xmm3
617; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
618; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
619; X32-SSE-NEXT:    psrlw $4, %xmm4
620; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
621; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
622; X32-SSE-NEXT:    psllw $4, %xmm5
623; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm5
624; X32-SSE-NEXT:    por %xmm4, %xmm5
625; X32-SSE-NEXT:    pand %xmm3, %xmm5
626; X32-SSE-NEXT:    pandn %xmm2, %xmm3
627; X32-SSE-NEXT:    por %xmm5, %xmm3
628; X32-SSE-NEXT:    movdqa %xmm3, %xmm2
629; X32-SSE-NEXT:    psrlw $6, %xmm2
630; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
631; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
632; X32-SSE-NEXT:    psllw $2, %xmm4
633; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
634; X32-SSE-NEXT:    por %xmm2, %xmm4
635; X32-SSE-NEXT:    paddb %xmm1, %xmm1
636; X32-SSE-NEXT:    pxor %xmm2, %xmm2
637; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
638; X32-SSE-NEXT:    pand %xmm2, %xmm4
639; X32-SSE-NEXT:    pandn %xmm3, %xmm2
640; X32-SSE-NEXT:    por %xmm4, %xmm2
641; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
642; X32-SSE-NEXT:    paddb %xmm2, %xmm3
643; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
644; X32-SSE-NEXT:    psrlw $7, %xmm4
645; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
646; X32-SSE-NEXT:    por %xmm3, %xmm4
647; X32-SSE-NEXT:    paddb %xmm1, %xmm1
648; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm0
649; X32-SSE-NEXT:    pand %xmm0, %xmm4
650; X32-SSE-NEXT:    pandn %xmm2, %xmm0
651; X32-SSE-NEXT:    por %xmm4, %xmm0
652; X32-SSE-NEXT:    retl
653  %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
654  %shl = shl <16 x i8> %a, %b
655  %lshr = lshr <16 x i8> %a, %b8
656  %or = or <16 x i8> %shl, %lshr
657  ret <16 x i8> %or
658}
659
660;
661; Uniform Variable Rotates
662;
663
664define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
665; SSE-LABEL: splatvar_rotate_v2i64:
666; SSE:       # %bb.0:
667; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
668; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,64]
669; SSE-NEXT:    psubq %xmm2, %xmm3
670; SSE-NEXT:    movdqa %xmm0, %xmm2
671; SSE-NEXT:    psllq %xmm1, %xmm2
672; SSE-NEXT:    psrlq %xmm3, %xmm0
673; SSE-NEXT:    por %xmm2, %xmm0
674; SSE-NEXT:    retq
675;
676; AVX1-LABEL: splatvar_rotate_v2i64:
677; AVX1:       # %bb.0:
678; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
679; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
680; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
681; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
682; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
683; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
684; AVX1-NEXT:    retq
685;
686; AVX2-LABEL: splatvar_rotate_v2i64:
687; AVX2:       # %bb.0:
688; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm2
689; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
690; AVX2-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
691; AVX2-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
692; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
693; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
694; AVX2-NEXT:    retq
695;
696; AVX512F-LABEL: splatvar_rotate_v2i64:
697; AVX512F:       # %bb.0:
698; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
699; AVX512F-NEXT:    vpbroadcastq %xmm1, %xmm1
700; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
701; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
702; AVX512F-NEXT:    vzeroupper
703; AVX512F-NEXT:    retq
704;
705; AVX512VL-LABEL: splatvar_rotate_v2i64:
706; AVX512VL:       # %bb.0:
707; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
708; AVX512VL-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
709; AVX512VL-NEXT:    retq
710;
711; AVX512BW-LABEL: splatvar_rotate_v2i64:
712; AVX512BW:       # %bb.0:
713; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
714; AVX512BW-NEXT:    vpbroadcastq %xmm1, %xmm1
715; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
716; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
717; AVX512BW-NEXT:    vzeroupper
718; AVX512BW-NEXT:    retq
719;
720; AVX512VLBW-LABEL: splatvar_rotate_v2i64:
721; AVX512VLBW:       # %bb.0:
722; AVX512VLBW-NEXT:    vpbroadcastq %xmm1, %xmm1
723; AVX512VLBW-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
724; AVX512VLBW-NEXT:    retq
725;
726; XOPAVX1-LABEL: splatvar_rotate_v2i64:
727; XOPAVX1:       # %bb.0:
728; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
729; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
730; XOPAVX1-NEXT:    retq
731;
732; XOPAVX2-LABEL: splatvar_rotate_v2i64:
733; XOPAVX2:       # %bb.0:
734; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
735; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
736; XOPAVX2-NEXT:    retq
737;
738; X32-SSE-LABEL: splatvar_rotate_v2i64:
739; X32-SSE:       # %bb.0:
740; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
741; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [64,0,64,0]
742; X32-SSE-NEXT:    psubq %xmm2, %xmm3
743; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
744; X32-SSE-NEXT:    psllq %xmm1, %xmm2
745; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
746; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
747; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
748; X32-SSE-NEXT:    psrlq %xmm3, %xmm0
749; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
750; X32-SSE-NEXT:    orpd %xmm2, %xmm0
751; X32-SSE-NEXT:    retl
752  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
753  %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat
754  %shl = shl <2 x i64> %a, %splat
755  %lshr = lshr <2 x i64> %a, %splat64
756  %or = or <2 x i64> %shl, %lshr
757  ret <2 x i64> %or
758}
759
760define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
761; SSE2-LABEL: splatvar_rotate_v4i32:
762; SSE2:       # %bb.0:
763; SSE2-NEXT:    xorps %xmm2, %xmm2
764; SSE2-NEXT:    xorps %xmm3, %xmm3
765; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
766; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
767; SSE2-NEXT:    movdqa %xmm0, %xmm4
768; SSE2-NEXT:    pslld %xmm3, %xmm4
769; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32]
770; SSE2-NEXT:    psubd %xmm1, %xmm3
771; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
772; SSE2-NEXT:    psrld %xmm2, %xmm0
773; SSE2-NEXT:    por %xmm4, %xmm0
774; SSE2-NEXT:    retq
775;
776; SSE41-LABEL: splatvar_rotate_v4i32:
777; SSE41:       # %bb.0:
778; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
779; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
780; SSE41-NEXT:    movdqa %xmm0, %xmm3
781; SSE41-NEXT:    pslld %xmm2, %xmm3
782; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
783; SSE41-NEXT:    psubd %xmm1, %xmm2
784; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero
785; SSE41-NEXT:    psrld %xmm1, %xmm0
786; SSE41-NEXT:    por %xmm3, %xmm0
787; SSE41-NEXT:    retq
788;
789; AVX1-LABEL: splatvar_rotate_v4i32:
790; AVX1:       # %bb.0:
791; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
792; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
793; AVX1-NEXT:    vpslld %xmm2, %xmm0, %xmm2
794; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
795; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
796; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
797; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
798; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
799; AVX1-NEXT:    retq
800;
801; AVX2-LABEL: splatvar_rotate_v4i32:
802; AVX2:       # %bb.0:
803; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
804; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
805; AVX2-NEXT:    vpslld %xmm2, %xmm0, %xmm2
806; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
807; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
808; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
809; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
810; AVX2-NEXT:    retq
811;
812; AVX512F-LABEL: splatvar_rotate_v4i32:
813; AVX512F:       # %bb.0:
814; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
815; AVX512F-NEXT:    vpbroadcastd %xmm1, %xmm1
816; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
817; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
818; AVX512F-NEXT:    vzeroupper
819; AVX512F-NEXT:    retq
820;
821; AVX512VL-LABEL: splatvar_rotate_v4i32:
822; AVX512VL:       # %bb.0:
823; AVX512VL-NEXT:    vpbroadcastd %xmm1, %xmm1
824; AVX512VL-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
825; AVX512VL-NEXT:    retq
826;
827; AVX512BW-LABEL: splatvar_rotate_v4i32:
828; AVX512BW:       # %bb.0:
829; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
830; AVX512BW-NEXT:    vpbroadcastd %xmm1, %xmm1
831; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
832; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
833; AVX512BW-NEXT:    vzeroupper
834; AVX512BW-NEXT:    retq
835;
836; AVX512VLBW-LABEL: splatvar_rotate_v4i32:
837; AVX512VLBW:       # %bb.0:
838; AVX512VLBW-NEXT:    vpbroadcastd %xmm1, %xmm1
839; AVX512VLBW-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
840; AVX512VLBW-NEXT:    retq
841;
842; XOPAVX1-LABEL: splatvar_rotate_v4i32:
843; XOPAVX1:       # %bb.0:
844; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
845; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
846; XOPAVX1-NEXT:    retq
847;
848; XOPAVX2-LABEL: splatvar_rotate_v4i32:
849; XOPAVX2:       # %bb.0:
850; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
851; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
852; XOPAVX2-NEXT:    retq
853;
854; X32-SSE-LABEL: splatvar_rotate_v4i32:
855; X32-SSE:       # %bb.0:
856; X32-SSE-NEXT:    xorps %xmm2, %xmm2
857; X32-SSE-NEXT:    xorps %xmm3, %xmm3
858; X32-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
859; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
860; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
861; X32-SSE-NEXT:    pslld %xmm3, %xmm4
862; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32]
863; X32-SSE-NEXT:    psubd %xmm1, %xmm3
864; X32-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
865; X32-SSE-NEXT:    psrld %xmm2, %xmm0
866; X32-SSE-NEXT:    por %xmm4, %xmm0
867; X32-SSE-NEXT:    retl
868  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
869  %splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat
870  %shl = shl <4 x i32> %a, %splat
871  %lshr = lshr <4 x i32> %a, %splat32
872  %or = or <4 x i32> %shl, %lshr
873  ret <4 x i32> %or
874}
875
876define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
877; SSE2-LABEL: splatvar_rotate_v8i16:
878; SSE2:       # %bb.0:
879; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
880; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
881; SSE2-NEXT:    pextrw $0, %xmm1, %eax
882; SSE2-NEXT:    movd %eax, %xmm1
883; SSE2-NEXT:    movdqa %xmm0, %xmm3
884; SSE2-NEXT:    psllw %xmm1, %xmm3
885; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
886; SSE2-NEXT:    psubw %xmm2, %xmm1
887; SSE2-NEXT:    pextrw $0, %xmm1, %eax
888; SSE2-NEXT:    movd %eax, %xmm1
889; SSE2-NEXT:    psrlw %xmm1, %xmm0
890; SSE2-NEXT:    por %xmm3, %xmm0
891; SSE2-NEXT:    retq
892;
893; SSE41-LABEL: splatvar_rotate_v8i16:
894; SSE41:       # %bb.0:
895; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
896; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
897; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
898; SSE41-NEXT:    movdqa %xmm0, %xmm3
899; SSE41-NEXT:    psllw %xmm2, %xmm3
900; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
901; SSE41-NEXT:    psubw %xmm1, %xmm2
902; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
903; SSE41-NEXT:    psrlw %xmm1, %xmm0
904; SSE41-NEXT:    por %xmm3, %xmm0
905; SSE41-NEXT:    retq
906;
907; AVX1-LABEL: splatvar_rotate_v8i16:
908; AVX1:       # %bb.0:
909; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
910; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
911; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
912; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
913; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
914; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
915; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
916; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
917; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
918; AVX1-NEXT:    retq
919;
920; AVX2-LABEL: splatvar_rotate_v8i16:
921; AVX2:       # %bb.0:
922; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
923; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
924; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
925; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
926; AVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
927; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
928; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
929; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
930; AVX2-NEXT:    retq
931;
932; AVX512F-LABEL: splatvar_rotate_v8i16:
933; AVX512F:       # %bb.0:
934; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
935; AVX512F-NEXT:    vpbroadcastw %xmm1, %xmm1
936; AVX512F-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
937; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
938; AVX512F-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
939; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
940; AVX512F-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
941; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
942; AVX512F-NEXT:    retq
943;
944; AVX512VL-LABEL: splatvar_rotate_v8i16:
945; AVX512VL:       # %bb.0:
946; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
947; AVX512VL-NEXT:    vpbroadcastw %xmm1, %xmm1
948; AVX512VL-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
949; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
950; AVX512VL-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
951; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
952; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
953; AVX512VL-NEXT:    vpor %xmm0, %xmm2, %xmm0
954; AVX512VL-NEXT:    retq
955;
956; AVX512BW-LABEL: splatvar_rotate_v8i16:
957; AVX512BW:       # %bb.0:
958; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
959; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
960; AVX512BW-NEXT:    vpbroadcastw %xmm1, %xmm1
961; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
962; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
963; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
964; AVX512BW-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
965; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
966; AVX512BW-NEXT:    vzeroupper
967; AVX512BW-NEXT:    retq
968;
969; AVX512VLBW-LABEL: splatvar_rotate_v8i16:
970; AVX512VLBW:       # %bb.0:
971; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
972; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %xmm1
973; AVX512VLBW-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
974; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
975; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
976; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
977; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
978; AVX512VLBW-NEXT:    retq
979;
980; XOPAVX1-LABEL: splatvar_rotate_v8i16:
981; XOPAVX1:       # %bb.0:
982; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
983; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
984; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
985; XOPAVX1-NEXT:    retq
986;
987; XOPAVX2-LABEL: splatvar_rotate_v8i16:
988; XOPAVX2:       # %bb.0:
989; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
990; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
991; XOPAVX2-NEXT:    retq
992;
993; X32-SSE-LABEL: splatvar_rotate_v8i16:
994; X32-SSE:       # %bb.0:
995; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
996; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
997; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
998; X32-SSE-NEXT:    movd %eax, %xmm1
999; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
1000; X32-SSE-NEXT:    psllw %xmm1, %xmm3
1001; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
1002; X32-SSE-NEXT:    psubw %xmm2, %xmm1
1003; X32-SSE-NEXT:    pextrw $0, %xmm1, %eax
1004; X32-SSE-NEXT:    movd %eax, %xmm1
1005; X32-SSE-NEXT:    psrlw %xmm1, %xmm0
1006; X32-SSE-NEXT:    por %xmm3, %xmm0
1007; X32-SSE-NEXT:    retl
1008  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
1009  %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
1010  %shl = shl <8 x i16> %a, %splat
1011  %lshr = lshr <8 x i16> %a, %splat16
1012  %or = or <8 x i16> %shl, %lshr
1013  ret <8 x i16> %or
1014}
1015
1016define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
1017; SSE2-LABEL: splatvar_rotate_v16i8:
1018; SSE2:       # %bb.0:
1019; SSE2-NEXT:    movdqa %xmm0, %xmm2
1020; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1021; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
1022; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
1023; SSE2-NEXT:    movdqa %xmm2, %xmm0
1024; SSE2-NEXT:    psrlw $4, %xmm0
1025; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1026; SSE2-NEXT:    movdqa %xmm2, %xmm3
1027; SSE2-NEXT:    psllw $4, %xmm3
1028; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
1029; SSE2-NEXT:    por %xmm0, %xmm3
1030; SSE2-NEXT:    psllw $5, %xmm1
1031; SSE2-NEXT:    pxor %xmm0, %xmm0
1032; SSE2-NEXT:    pxor %xmm4, %xmm4
1033; SSE2-NEXT:    pcmpgtb %xmm1, %xmm4
1034; SSE2-NEXT:    pand %xmm4, %xmm3
1035; SSE2-NEXT:    pandn %xmm2, %xmm4
1036; SSE2-NEXT:    por %xmm3, %xmm4
1037; SSE2-NEXT:    movdqa %xmm4, %xmm2
1038; SSE2-NEXT:    psrlw $6, %xmm2
1039; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
1040; SSE2-NEXT:    movdqa %xmm4, %xmm3
1041; SSE2-NEXT:    psllw $2, %xmm3
1042; SSE2-NEXT:    pand {{.*}}(%rip), %xmm3
1043; SSE2-NEXT:    por %xmm2, %xmm3
1044; SSE2-NEXT:    paddb %xmm1, %xmm1
1045; SSE2-NEXT:    pxor %xmm2, %xmm2
1046; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
1047; SSE2-NEXT:    pand %xmm2, %xmm3
1048; SSE2-NEXT:    pandn %xmm4, %xmm2
1049; SSE2-NEXT:    por %xmm3, %xmm2
1050; SSE2-NEXT:    movdqa %xmm2, %xmm3
1051; SSE2-NEXT:    paddb %xmm2, %xmm3
1052; SSE2-NEXT:    movdqa %xmm2, %xmm4
1053; SSE2-NEXT:    psrlw $7, %xmm4
1054; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
1055; SSE2-NEXT:    por %xmm3, %xmm4
1056; SSE2-NEXT:    paddb %xmm1, %xmm1
1057; SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
1058; SSE2-NEXT:    pand %xmm0, %xmm4
1059; SSE2-NEXT:    pandn %xmm2, %xmm0
1060; SSE2-NEXT:    por %xmm4, %xmm0
1061; SSE2-NEXT:    retq
1062;
1063; SSE41-LABEL: splatvar_rotate_v16i8:
1064; SSE41:       # %bb.0:
1065; SSE41-NEXT:    movdqa %xmm1, %xmm2
1066; SSE41-NEXT:    movdqa %xmm0, %xmm1
1067; SSE41-NEXT:    pxor %xmm0, %xmm0
1068; SSE41-NEXT:    pshufb %xmm0, %xmm2
1069; SSE41-NEXT:    movdqa %xmm1, %xmm0
1070; SSE41-NEXT:    psrlw $4, %xmm0
1071; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
1072; SSE41-NEXT:    movdqa %xmm1, %xmm3
1073; SSE41-NEXT:    psllw $4, %xmm3
1074; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1075; SSE41-NEXT:    por %xmm0, %xmm3
1076; SSE41-NEXT:    psllw $5, %xmm2
1077; SSE41-NEXT:    movdqa %xmm2, %xmm0
1078; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
1079; SSE41-NEXT:    movdqa %xmm1, %xmm0
1080; SSE41-NEXT:    psrlw $6, %xmm0
1081; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
1082; SSE41-NEXT:    movdqa %xmm1, %xmm3
1083; SSE41-NEXT:    psllw $2, %xmm3
1084; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1085; SSE41-NEXT:    por %xmm0, %xmm3
1086; SSE41-NEXT:    paddb %xmm2, %xmm2
1087; SSE41-NEXT:    movdqa %xmm2, %xmm0
1088; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
1089; SSE41-NEXT:    movdqa %xmm1, %xmm0
1090; SSE41-NEXT:    paddb %xmm1, %xmm0
1091; SSE41-NEXT:    movdqa %xmm1, %xmm3
1092; SSE41-NEXT:    psrlw $7, %xmm3
1093; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1094; SSE41-NEXT:    por %xmm0, %xmm3
1095; SSE41-NEXT:    paddb %xmm2, %xmm2
1096; SSE41-NEXT:    movdqa %xmm2, %xmm0
1097; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
1098; SSE41-NEXT:    movdqa %xmm1, %xmm0
1099; SSE41-NEXT:    retq
1100;
1101; AVX1-LABEL: splatvar_rotate_v16i8:
1102; AVX1:       # %bb.0:
1103; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1104; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1105; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
1106; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1107; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm3
1108; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1109; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
1110; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
1111; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1112; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm2
1113; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1114; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
1115; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1116; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
1117; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1118; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1119; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
1120; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
1121; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1122; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
1123; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1124; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1125; AVX1-NEXT:    retq
1126;
1127; AVX2-LABEL: splatvar_rotate_v16i8:
1128; AVX2:       # %bb.0:
1129; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
1130; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm2
1131; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1132; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm3
1133; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1134; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
1135; AVX2-NEXT:    vpsllw $5, %xmm1, %xmm1
1136; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1137; AVX2-NEXT:    vpsrlw $6, %xmm0, %xmm2
1138; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1139; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm3
1140; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1141; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
1142; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1143; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1144; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
1145; AVX2-NEXT:    vpsrlw $7, %xmm0, %xmm3
1146; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1147; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
1148; AVX2-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1149; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1150; AVX2-NEXT:    retq
1151;
1152; AVX512F-LABEL: splatvar_rotate_v16i8:
1153; AVX512F:       # %bb.0:
1154; AVX512F-NEXT:    vpbroadcastb %xmm1, %xmm1
1155; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm2
1156; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1157; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm3
1158; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1159; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
1160; AVX512F-NEXT:    vpsllw $5, %xmm1, %xmm1
1161; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1162; AVX512F-NEXT:    vpsrlw $6, %xmm0, %xmm2
1163; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1164; AVX512F-NEXT:    vpsllw $2, %xmm0, %xmm3
1165; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1166; AVX512F-NEXT:    vpor %xmm2, %xmm3, %xmm2
1167; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1168; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1169; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
1170; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm3
1171; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1172; AVX512F-NEXT:    vpor %xmm3, %xmm2, %xmm2
1173; AVX512F-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1174; AVX512F-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1175; AVX512F-NEXT:    retq
1176;
1177; AVX512VL-LABEL: splatvar_rotate_v16i8:
1178; AVX512VL:       # %bb.0:
1179; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
1180; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm2
1181; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1182; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm3
1183; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1184; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
1185; AVX512VL-NEXT:    vpsllw $5, %xmm1, %xmm1
1186; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1187; AVX512VL-NEXT:    vpsrlw $6, %xmm0, %xmm2
1188; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1189; AVX512VL-NEXT:    vpsllw $2, %xmm0, %xmm3
1190; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1191; AVX512VL-NEXT:    vpor %xmm2, %xmm3, %xmm2
1192; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1193; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1194; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
1195; AVX512VL-NEXT:    vpsrlw $7, %xmm0, %xmm3
1196; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1197; AVX512VL-NEXT:    vpor %xmm3, %xmm2, %xmm2
1198; AVX512VL-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1199; AVX512VL-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1200; AVX512VL-NEXT:    retq
1201;
1202; AVX512BW-LABEL: splatvar_rotate_v16i8:
1203; AVX512BW:       # %bb.0:
1204; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
1205; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1206; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1207; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
1208; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
1209; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1210; AVX512BW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
1211; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1212; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
1213; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1214; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
1215; AVX512BW-NEXT:    vzeroupper
1216; AVX512BW-NEXT:    retq
1217;
1218; AVX512VLBW-LABEL: splatvar_rotate_v16i8:
1219; AVX512VLBW:       # %bb.0:
1220; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
1221; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1222; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1223; AVX512VLBW-NEXT:    vpsllvw %ymm2, %ymm0, %ymm2
1224; AVX512VLBW-NEXT:    vpmovwb %ymm2, %xmm2
1225; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
1226; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
1227; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1228; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
1229; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
1230; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
1231; AVX512VLBW-NEXT:    vzeroupper
1232; AVX512VLBW-NEXT:    retq
1233;
1234; XOPAVX1-LABEL: splatvar_rotate_v16i8:
1235; XOPAVX1:       # %bb.0:
1236; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1237; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1238; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
1239; XOPAVX1-NEXT:    retq
1240;
1241; XOPAVX2-LABEL: splatvar_rotate_v16i8:
1242; XOPAVX2:       # %bb.0:
1243; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
1244; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
1245; XOPAVX2-NEXT:    retq
1246;
1247; X32-SSE-LABEL: splatvar_rotate_v16i8:
1248; X32-SSE:       # %bb.0:
1249; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
1250; X32-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1251; X32-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,2,3,4,5,6,7]
1252; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
1253; X32-SSE-NEXT:    movdqa %xmm2, %xmm0
1254; X32-SSE-NEXT:    psrlw $4, %xmm0
1255; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1256; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
1257; X32-SSE-NEXT:    psllw $4, %xmm3
1258; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
1259; X32-SSE-NEXT:    por %xmm0, %xmm3
1260; X32-SSE-NEXT:    psllw $5, %xmm1
1261; X32-SSE-NEXT:    pxor %xmm0, %xmm0
1262; X32-SSE-NEXT:    pxor %xmm4, %xmm4
1263; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm4
1264; X32-SSE-NEXT:    pand %xmm4, %xmm3
1265; X32-SSE-NEXT:    pandn %xmm2, %xmm4
1266; X32-SSE-NEXT:    por %xmm3, %xmm4
1267; X32-SSE-NEXT:    movdqa %xmm4, %xmm2
1268; X32-SSE-NEXT:    psrlw $6, %xmm2
1269; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
1270; X32-SSE-NEXT:    movdqa %xmm4, %xmm3
1271; X32-SSE-NEXT:    psllw $2, %xmm3
1272; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm3
1273; X32-SSE-NEXT:    por %xmm2, %xmm3
1274; X32-SSE-NEXT:    paddb %xmm1, %xmm1
1275; X32-SSE-NEXT:    pxor %xmm2, %xmm2
1276; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
1277; X32-SSE-NEXT:    pand %xmm2, %xmm3
1278; X32-SSE-NEXT:    pandn %xmm4, %xmm2
1279; X32-SSE-NEXT:    por %xmm3, %xmm2
1280; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
1281; X32-SSE-NEXT:    paddb %xmm2, %xmm3
1282; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
1283; X32-SSE-NEXT:    psrlw $7, %xmm4
1284; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
1285; X32-SSE-NEXT:    por %xmm3, %xmm4
1286; X32-SSE-NEXT:    paddb %xmm1, %xmm1
1287; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm0
1288; X32-SSE-NEXT:    pand %xmm0, %xmm4
1289; X32-SSE-NEXT:    pandn %xmm2, %xmm0
1290; X32-SSE-NEXT:    por %xmm4, %xmm0
1291; X32-SSE-NEXT:    retl
1292  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
1293  %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
1294  %shl = shl <16 x i8> %a, %splat
1295  %lshr = lshr <16 x i8> %a, %splat8
1296  %or = or <16 x i8> %shl, %lshr
1297  ret <16 x i8> %or
1298}
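; The function above exercises the classic byte rotate by a splatted variable
; amount: per element, roughly (a << n) | (a >> (8 - n)), e.g. n == 3 gives
; (a << 3) | (a >> 5). XOP can use vprotb directly, AVX512BW widens the bytes
; to words so the per-element word shifts vpsllvw/vpsrlvw apply, and plain SSE
; falls back to the pcmpgtb/pblendvb shift ladder, scaling the amount with
; psllw $5 so each step can test the next amount bit in the byte's sign bit.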
1299
1300;
1301; Constant Rotates
1302;
1303
1304define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
1305; SSE2-LABEL: constant_rotate_v2i64:
1306; SSE2:       # %bb.0:
1307; SSE2-NEXT:    movdqa %xmm0, %xmm1
1308; SSE2-NEXT:    psllq $4, %xmm1
1309; SSE2-NEXT:    movdqa %xmm0, %xmm2
1310; SSE2-NEXT:    psllq $14, %xmm2
1311; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1312; SSE2-NEXT:    movdqa %xmm0, %xmm1
1313; SSE2-NEXT:    psrlq $60, %xmm1
1314; SSE2-NEXT:    psrlq $50, %xmm0
1315; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1316; SSE2-NEXT:    orpd %xmm2, %xmm0
1317; SSE2-NEXT:    retq
1318;
1319; SSE41-LABEL: constant_rotate_v2i64:
1320; SSE41:       # %bb.0:
1321; SSE41-NEXT:    movdqa %xmm0, %xmm1
1322; SSE41-NEXT:    psllq $14, %xmm1
1323; SSE41-NEXT:    movdqa %xmm0, %xmm2
1324; SSE41-NEXT:    psllq $4, %xmm2
1325; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1326; SSE41-NEXT:    movdqa %xmm0, %xmm1
1327; SSE41-NEXT:    psrlq $50, %xmm1
1328; SSE41-NEXT:    psrlq $60, %xmm0
1329; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1330; SSE41-NEXT:    por %xmm2, %xmm0
1331; SSE41-NEXT:    retq
1332;
1333; AVX1-LABEL: constant_rotate_v2i64:
1334; AVX1:       # %bb.0:
1335; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm1
1336; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm2
1337; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1338; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm2
1339; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
1340; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1341; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
1342; AVX1-NEXT:    retq
1343;
1344; AVX2-LABEL: constant_rotate_v2i64:
1345; AVX2:       # %bb.0:
1346; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm1
1347; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
1348; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
1349; AVX2-NEXT:    retq
1350;
1351; AVX512F-LABEL: constant_rotate_v2i64:
1352; AVX512F:       # %bb.0:
1353; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1354; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,14]
1355; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
1356; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1357; AVX512F-NEXT:    vzeroupper
1358; AVX512F-NEXT:    retq
1359;
1360; AVX512VL-LABEL: constant_rotate_v2i64:
1361; AVX512VL:       # %bb.0:
1362; AVX512VL-NEXT:    vprolvq {{.*}}(%rip), %xmm0, %xmm0
1363; AVX512VL-NEXT:    retq
1364;
1365; AVX512BW-LABEL: constant_rotate_v2i64:
1366; AVX512BW:       # %bb.0:
1367; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1368; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,14]
1369; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
1370; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1371; AVX512BW-NEXT:    vzeroupper
1372; AVX512BW-NEXT:    retq
1373;
1374; AVX512VLBW-LABEL: constant_rotate_v2i64:
1375; AVX512VLBW:       # %bb.0:
1376; AVX512VLBW-NEXT:    vprolvq {{.*}}(%rip), %xmm0, %xmm0
1377; AVX512VLBW-NEXT:    retq
1378;
1379; XOP-LABEL: constant_rotate_v2i64:
1380; XOP:       # %bb.0:
1381; XOP-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
1382; XOP-NEXT:    retq
1383;
1384; X32-SSE-LABEL: constant_rotate_v2i64:
1385; X32-SSE:       # %bb.0:
1386; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1387; X32-SSE-NEXT:    psllq $4, %xmm1
1388; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
1389; X32-SSE-NEXT:    psllq $14, %xmm2
1390; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1391; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1392; X32-SSE-NEXT:    psrlq $60, %xmm1
1393; X32-SSE-NEXT:    psrlq $50, %xmm0
1394; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1395; X32-SSE-NEXT:    orpd %xmm2, %xmm0
1396; X32-SSE-NEXT:    retl
1397  %shl = shl <2 x i64> %a, <i64 4, i64 14>
1398  %lshr = lshr <2 x i64> %a, <i64 60, i64 50>
1399  %or = or <2 x i64> %shl, %lshr
1400  ret <2 x i64> %or
1401}
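; Constant per-element i64 rotate: the IR is (a << <4,14>) | (a >> <60,50>),
; i.e. rotates by 4 and 14. Without per-element 64-bit shifts, SSE materializes
; each half with two immediate psllq/psrlq shifts and blends the halves; AVX2
; uses vpsllvq/vpsrlvq, and the AVX512 targets recognize the rotate and emit
; vprolvq with the amount vector [4,14].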
1402
1403define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
1404; SSE2-LABEL: constant_rotate_v4i32:
1405; SSE2:       # %bb.0:
1406; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
1407; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1408; SSE2-NEXT:    pmuludq %xmm1, %xmm0
1409; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
1410; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1411; SSE2-NEXT:    pmuludq %xmm2, %xmm1
1412; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
1413; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1414; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1415; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1416; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1417; SSE2-NEXT:    por %xmm3, %xmm0
1418; SSE2-NEXT:    retq
1419;
1420; SSE41-LABEL: constant_rotate_v4i32:
1421; SSE41:       # %bb.0:
1422; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
1423; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1424; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1425; SSE41-NEXT:    pmuludq %xmm2, %xmm3
1426; SSE41-NEXT:    pmuludq %xmm1, %xmm0
1427; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1428; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1429; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
1430; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1431; SSE41-NEXT:    por %xmm1, %xmm0
1432; SSE41-NEXT:    retq
1433;
1434; AVX1-LABEL: constant_rotate_v4i32:
1435; AVX1:       # %bb.0:
1436; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,32,64,128]
1437; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
1438; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1439; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
1440; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
1441; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1442; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1443; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
1444; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1445; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1446; AVX1-NEXT:    retq
1447;
1448; AVX2-LABEL: constant_rotate_v4i32:
1449; AVX2:       # %bb.0:
1450; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
1451; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
1452; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1453; AVX2-NEXT:    retq
1454;
1455; AVX512F-LABEL: constant_rotate_v4i32:
1456; AVX512F:       # %bb.0:
1457; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1458; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
1459; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
1460; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1461; AVX512F-NEXT:    vzeroupper
1462; AVX512F-NEXT:    retq
1463;
1464; AVX512VL-LABEL: constant_rotate_v4i32:
1465; AVX512VL:       # %bb.0:
1466; AVX512VL-NEXT:    vprolvd {{.*}}(%rip), %xmm0, %xmm0
1467; AVX512VL-NEXT:    retq
1468;
1469; AVX512BW-LABEL: constant_rotate_v4i32:
1470; AVX512BW:       # %bb.0:
1471; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1472; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
1473; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
1474; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1475; AVX512BW-NEXT:    vzeroupper
1476; AVX512BW-NEXT:    retq
1477;
1478; AVX512VLBW-LABEL: constant_rotate_v4i32:
1479; AVX512VLBW:       # %bb.0:
1480; AVX512VLBW-NEXT:    vprolvd {{.*}}(%rip), %xmm0, %xmm0
1481; AVX512VLBW-NEXT:    retq
1482;
1483; XOP-LABEL: constant_rotate_v4i32:
1484; XOP:       # %bb.0:
1485; XOP-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
1486; XOP-NEXT:    retq
1487;
1488; X32-SSE-LABEL: constant_rotate_v4i32:
1489; X32-SSE:       # %bb.0:
1490; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
1491; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1492; X32-SSE-NEXT:    pmuludq %xmm1, %xmm0
1493; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
1494; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1495; X32-SSE-NEXT:    pmuludq %xmm2, %xmm1
1496; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
1497; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1498; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1499; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1500; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1501; X32-SSE-NEXT:    por %xmm3, %xmm0
1502; X32-SSE-NEXT:    retl
1503  %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
1504  %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25>
1505  %or = or <4 x i32> %shl, %lshr
1506  ret <4 x i32> %or
1507}
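; Constant i32 rotates by <4,5,6,7>. The SSE lowering multiplies by
; <16,32,64,128> = <2^4,2^5,2^6,2^7> with pmuludq: in each 64-bit product the
; low dword is (a << n) and the high dword is (a >> (32 - n)), so shuffling the
; low and high dwords back together and or'ing them yields the rotate. AVX512
; again folds the whole thing to vprolvd.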
1508
1509define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
1510; SSE-LABEL: constant_rotate_v8i16:
1511; SSE:       # %bb.0:
1512; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1513; SSE-NEXT:    movdqa %xmm0, %xmm2
1514; SSE-NEXT:    pmulhuw %xmm1, %xmm2
1515; SSE-NEXT:    pmullw %xmm1, %xmm0
1516; SSE-NEXT:    por %xmm2, %xmm0
1517; SSE-NEXT:    retq
1518;
1519; AVX-LABEL: constant_rotate_v8i16:
1520; AVX:       # %bb.0:
1521; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1522; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
1523; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1524; AVX-NEXT:    vpor %xmm2, %xmm0, %xmm0
1525; AVX-NEXT:    retq
1526;
1527; AVX512F-LABEL: constant_rotate_v8i16:
1528; AVX512F:       # %bb.0:
1529; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1530; AVX512F-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
1531; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1532; AVX512F-NEXT:    vpor %xmm2, %xmm0, %xmm0
1533; AVX512F-NEXT:    retq
1534;
1535; AVX512VL-LABEL: constant_rotate_v8i16:
1536; AVX512VL:       # %bb.0:
1537; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1538; AVX512VL-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
1539; AVX512VL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1540; AVX512VL-NEXT:    vpor %xmm2, %xmm0, %xmm0
1541; AVX512VL-NEXT:    retq
1542;
1543; AVX512BW-LABEL: constant_rotate_v8i16:
1544; AVX512BW:       # %bb.0:
1545; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1546; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
1547; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
1548; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm2
1549; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
1550; AVX512BW-NEXT:    vpor %xmm2, %xmm0, %xmm0
1551; AVX512BW-NEXT:    vzeroupper
1552; AVX512BW-NEXT:    retq
1553;
1554; AVX512VLBW-LABEL: constant_rotate_v8i16:
1555; AVX512VLBW:       # %bb.0:
1556; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %xmm0, %xmm1
1557; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm0
1558; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1559; AVX512VLBW-NEXT:    retq
1560;
1561; XOP-LABEL: constant_rotate_v8i16:
1562; XOP:       # %bb.0:
1563; XOP-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm0
1564; XOP-NEXT:    retq
1565;
1566; X32-SSE-LABEL: constant_rotate_v8i16:
1567; X32-SSE:       # %bb.0:
1568; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1569; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
1570; X32-SSE-NEXT:    pmulhuw %xmm1, %xmm2
1571; X32-SSE-NEXT:    pmullw %xmm1, %xmm0
1572; X32-SSE-NEXT:    por %xmm2, %xmm0
1573; X32-SSE-NEXT:    retl
1574  %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
1575  %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9>
1576  %or = or <8 x i16> %shl, %lshr
1577  ret <8 x i16> %or
1578}
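; Constant i16 rotates by <0..7>. Multiplying by <1,2,4,...,128> = 2^n leaves
; the left-shifted half in the pmullw result and the right-shifted-by-(16-n)
; half in the pmulhuw result, so one multiply pair plus por covers all eight
; rotates. AVX512BW instead uses the per-element word shifts vpsllvw/vpsrlvw,
; and XOP has a direct vprotw.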
1579
1580define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
1581; SSE2-LABEL: constant_rotate_v16i8:
1582; SSE2:       # %bb.0:
1583; SSE2-NEXT:    movdqa %xmm0, %xmm1
1584; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
1585; SSE2-NEXT:    pxor %xmm0, %xmm0
1586; SSE2-NEXT:    pxor %xmm3, %xmm3
1587; SSE2-NEXT:    pcmpgtb %xmm2, %xmm3
1588; SSE2-NEXT:    movdqa %xmm1, %xmm4
1589; SSE2-NEXT:    psrlw $4, %xmm4
1590; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
1591; SSE2-NEXT:    movdqa %xmm1, %xmm5
1592; SSE2-NEXT:    psllw $4, %xmm5
1593; SSE2-NEXT:    pand {{.*}}(%rip), %xmm5
1594; SSE2-NEXT:    por %xmm4, %xmm5
1595; SSE2-NEXT:    pand %xmm3, %xmm5
1596; SSE2-NEXT:    pandn %xmm1, %xmm3
1597; SSE2-NEXT:    por %xmm5, %xmm3
1598; SSE2-NEXT:    movdqa %xmm3, %xmm1
1599; SSE2-NEXT:    psrlw $6, %xmm1
1600; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1601; SSE2-NEXT:    movdqa %xmm3, %xmm4
1602; SSE2-NEXT:    psllw $2, %xmm4
1603; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
1604; SSE2-NEXT:    por %xmm1, %xmm4
1605; SSE2-NEXT:    paddb %xmm2, %xmm2
1606; SSE2-NEXT:    pxor %xmm1, %xmm1
1607; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
1608; SSE2-NEXT:    pand %xmm1, %xmm4
1609; SSE2-NEXT:    pandn %xmm3, %xmm1
1610; SSE2-NEXT:    por %xmm4, %xmm1
1611; SSE2-NEXT:    movdqa %xmm1, %xmm3
1612; SSE2-NEXT:    paddb %xmm1, %xmm3
1613; SSE2-NEXT:    movdqa %xmm1, %xmm4
1614; SSE2-NEXT:    psrlw $7, %xmm4
1615; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
1616; SSE2-NEXT:    por %xmm3, %xmm4
1617; SSE2-NEXT:    paddb %xmm2, %xmm2
1618; SSE2-NEXT:    pcmpgtb %xmm2, %xmm0
1619; SSE2-NEXT:    pand %xmm0, %xmm4
1620; SSE2-NEXT:    pandn %xmm1, %xmm0
1621; SSE2-NEXT:    por %xmm4, %xmm0
1622; SSE2-NEXT:    retq
1623;
1624; SSE41-LABEL: constant_rotate_v16i8:
1625; SSE41:       # %bb.0:
1626; SSE41-NEXT:    movdqa %xmm0, %xmm1
1627; SSE41-NEXT:    psrlw $4, %xmm0
1628; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
1629; SSE41-NEXT:    movdqa %xmm1, %xmm2
1630; SSE41-NEXT:    psllw $4, %xmm2
1631; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
1632; SSE41-NEXT:    por %xmm0, %xmm2
1633; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,57600,41152,24704,8256]
1634; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1635; SSE41-NEXT:    movdqa %xmm1, %xmm2
1636; SSE41-NEXT:    psrlw $6, %xmm2
1637; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
1638; SSE41-NEXT:    movdqa %xmm1, %xmm3
1639; SSE41-NEXT:    psllw $2, %xmm3
1640; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1641; SSE41-NEXT:    por %xmm2, %xmm3
1642; SSE41-NEXT:    paddb %xmm0, %xmm0
1643; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
1644; SSE41-NEXT:    movdqa %xmm1, %xmm2
1645; SSE41-NEXT:    paddb %xmm1, %xmm2
1646; SSE41-NEXT:    movdqa %xmm1, %xmm3
1647; SSE41-NEXT:    psrlw $7, %xmm3
1648; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1649; SSE41-NEXT:    por %xmm2, %xmm3
1650; SSE41-NEXT:    paddb %xmm0, %xmm0
1651; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
1652; SSE41-NEXT:    movdqa %xmm1, %xmm0
1653; SSE41-NEXT:    retq
1654;
1655; AVX-LABEL: constant_rotate_v16i8:
1656; AVX:       # %bb.0:
1657; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
1658; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
1659; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
1660; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1661; AVX-NEXT:    vpor %xmm1, %xmm2, %xmm1
1662; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
1663; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1664; AVX-NEXT:    vpsrlw $6, %xmm0, %xmm1
1665; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
1666; AVX-NEXT:    vpsllw $2, %xmm0, %xmm3
1667; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1668; AVX-NEXT:    vpor %xmm1, %xmm3, %xmm1
1669; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1670; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1671; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
1672; AVX-NEXT:    vpsrlw $7, %xmm0, %xmm3
1673; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1674; AVX-NEXT:    vpor %xmm3, %xmm1, %xmm1
1675; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1676; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1677; AVX-NEXT:    retq
1678;
1679; AVX512F-LABEL: constant_rotate_v16i8:
1680; AVX512F:       # %bb.0:
1681; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm1
1682; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
1683; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm2
1684; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1685; AVX512F-NEXT:    vpor %xmm1, %xmm2, %xmm1
1686; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
1687; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1688; AVX512F-NEXT:    vpsrlw $6, %xmm0, %xmm1
1689; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
1690; AVX512F-NEXT:    vpsllw $2, %xmm0, %xmm3
1691; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1692; AVX512F-NEXT:    vpor %xmm1, %xmm3, %xmm1
1693; AVX512F-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1694; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1695; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
1696; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm3
1697; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1698; AVX512F-NEXT:    vpor %xmm3, %xmm1, %xmm1
1699; AVX512F-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1700; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1701; AVX512F-NEXT:    retq
1702;
1703; AVX512VL-LABEL: constant_rotate_v16i8:
1704; AVX512VL:       # %bb.0:
1705; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm1
1706; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
1707; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm2
1708; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1709; AVX512VL-NEXT:    vpor %xmm1, %xmm2, %xmm1
1710; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
1711; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1712; AVX512VL-NEXT:    vpsrlw $6, %xmm0, %xmm1
1713; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
1714; AVX512VL-NEXT:    vpsllw $2, %xmm0, %xmm3
1715; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1716; AVX512VL-NEXT:    vpor %xmm1, %xmm3, %xmm1
1717; AVX512VL-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1718; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1719; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm1
1720; AVX512VL-NEXT:    vpsrlw $7, %xmm0, %xmm3
1721; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1722; AVX512VL-NEXT:    vpor %xmm3, %xmm1, %xmm1
1723; AVX512VL-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1724; AVX512VL-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1725; AVX512VL-NEXT:    retq
1726;
1727; AVX512BW-LABEL: constant_rotate_v16i8:
1728; AVX512BW:       # %bb.0:
1729; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1730; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1731; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm1
1732; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
1733; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1734; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
1735; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1736; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1737; AVX512BW-NEXT:    vzeroupper
1738; AVX512BW-NEXT:    retq
1739;
1740; AVX512VLBW-LABEL: constant_rotate_v16i8:
1741; AVX512VLBW:       # %bb.0:
1742; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1743; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
1744; AVX512VLBW-NEXT:    vpmovwb %ymm1, %xmm1
1745; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
1746; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
1747; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1748; AVX512VLBW-NEXT:    vzeroupper
1749; AVX512VLBW-NEXT:    retq
1750;
1751; XOP-LABEL: constant_rotate_v16i8:
1752; XOP:       # %bb.0:
1753; XOP-NEXT:    vprotb {{.*}}(%rip), %xmm0, %xmm0
1754; XOP-NEXT:    retq
1755;
1756; X32-SSE-LABEL: constant_rotate_v16i8:
1757; X32-SSE:       # %bb.0:
1758; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1759; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
1760; X32-SSE-NEXT:    pxor %xmm0, %xmm0
1761; X32-SSE-NEXT:    pxor %xmm3, %xmm3
1762; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm3
1763; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
1764; X32-SSE-NEXT:    psrlw $4, %xmm4
1765; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
1766; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
1767; X32-SSE-NEXT:    psllw $4, %xmm5
1768; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm5
1769; X32-SSE-NEXT:    por %xmm4, %xmm5
1770; X32-SSE-NEXT:    pand %xmm3, %xmm5
1771; X32-SSE-NEXT:    pandn %xmm1, %xmm3
1772; X32-SSE-NEXT:    por %xmm5, %xmm3
1773; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
1774; X32-SSE-NEXT:    psrlw $6, %xmm1
1775; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
1776; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
1777; X32-SSE-NEXT:    psllw $2, %xmm4
1778; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
1779; X32-SSE-NEXT:    por %xmm1, %xmm4
1780; X32-SSE-NEXT:    paddb %xmm2, %xmm2
1781; X32-SSE-NEXT:    pxor %xmm1, %xmm1
1782; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm1
1783; X32-SSE-NEXT:    pand %xmm1, %xmm4
1784; X32-SSE-NEXT:    pandn %xmm3, %xmm1
1785; X32-SSE-NEXT:    por %xmm4, %xmm1
1786; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
1787; X32-SSE-NEXT:    paddb %xmm1, %xmm3
1788; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
1789; X32-SSE-NEXT:    psrlw $7, %xmm4
1790; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
1791; X32-SSE-NEXT:    por %xmm3, %xmm4
1792; X32-SSE-NEXT:    paddb %xmm2, %xmm2
1793; X32-SSE-NEXT:    pcmpgtb %xmm2, %xmm0
1794; X32-SSE-NEXT:    pand %xmm0, %xmm4
1795; X32-SSE-NEXT:    pandn %xmm1, %xmm0
1796; X32-SSE-NEXT:    por %xmm4, %xmm0
1797; X32-SSE-NEXT:    retl
1798  %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1799  %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1800  %or = or <16 x i8> %shl, %lshr
1801  ret <16 x i8> %or
1802}
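; Constant byte rotates. With no byte shifts or byte multiplies available, the
; SSE/AVX lowering runs the blend ladder (pblendvb, or pcmpgtb/pand/pandn on
; SSE2): rotate by 4, then 2, then 1, selecting each step with one bit of the
; amount. The control constant [8192,24640,...] in effect packs the per-byte
; amounts pre-shifted left by 5, so the bit driving each step sits in the
; byte's sign bit. AVX512BW widens to words and uses vpsllvw/vpsrlvw instead.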
1803
1804;
1805; Uniform Constant Rotates
1806;
1807
1808define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
1809; SSE-LABEL: splatconstant_rotate_v2i64:
1810; SSE:       # %bb.0:
1811; SSE-NEXT:    movdqa %xmm0, %xmm1
1812; SSE-NEXT:    psllq $14, %xmm1
1813; SSE-NEXT:    psrlq $50, %xmm0
1814; SSE-NEXT:    por %xmm1, %xmm0
1815; SSE-NEXT:    retq
1816;
1817; AVX-LABEL: splatconstant_rotate_v2i64:
1818; AVX:       # %bb.0:
1819; AVX-NEXT:    vpsllq $14, %xmm0, %xmm1
1820; AVX-NEXT:    vpsrlq $50, %xmm0, %xmm0
1821; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1822; AVX-NEXT:    retq
1823;
1824; AVX512F-LABEL: splatconstant_rotate_v2i64:
1825; AVX512F:       # %bb.0:
1826; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1827; AVX512F-NEXT:    vprolq $14, %zmm0, %zmm0
1828; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1829; AVX512F-NEXT:    vzeroupper
1830; AVX512F-NEXT:    retq
1831;
1832; AVX512VL-LABEL: splatconstant_rotate_v2i64:
1833; AVX512VL:       # %bb.0:
1834; AVX512VL-NEXT:    vprolq $14, %xmm0, %xmm0
1835; AVX512VL-NEXT:    retq
1836;
1837; AVX512BW-LABEL: splatconstant_rotate_v2i64:
1838; AVX512BW:       # %bb.0:
1839; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1840; AVX512BW-NEXT:    vprolq $14, %zmm0, %zmm0
1841; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1842; AVX512BW-NEXT:    vzeroupper
1843; AVX512BW-NEXT:    retq
1844;
1845; AVX512VLBW-LABEL: splatconstant_rotate_v2i64:
1846; AVX512VLBW:       # %bb.0:
1847; AVX512VLBW-NEXT:    vprolq $14, %xmm0, %xmm0
1848; AVX512VLBW-NEXT:    retq
1849;
1850; XOP-LABEL: splatconstant_rotate_v2i64:
1851; XOP:       # %bb.0:
1852; XOP-NEXT:    vprotq $14, %xmm0, %xmm0
1853; XOP-NEXT:    retq
1854;
1855; X32-SSE-LABEL: splatconstant_rotate_v2i64:
1856; X32-SSE:       # %bb.0:
1857; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1858; X32-SSE-NEXT:    psllq $14, %xmm1
1859; X32-SSE-NEXT:    psrlq $50, %xmm0
1860; X32-SSE-NEXT:    por %xmm1, %xmm0
1861; X32-SSE-NEXT:    retl
1862  %shl = shl <2 x i64> %a, <i64 14, i64 14>
1863  %lshr = lshr <2 x i64> %a, <i64 50, i64 50>
1864  %or = or <2 x i64> %shl, %lshr
1865  ret <2 x i64> %or
1866}
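; Uniform constant rotate by 14: (a << 14) | (a >> 50), since 14 + 50 == 64.
; Targets with vector rotate support fold this to a single vprolq $14 (XOP:
; vprotq $14); everything else keeps the two immediate shifts plus por.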
1867
1868define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
1869; SSE-LABEL: splatconstant_rotate_v4i32:
1870; SSE:       # %bb.0:
1871; SSE-NEXT:    movdqa %xmm0, %xmm1
1872; SSE-NEXT:    psrld $28, %xmm1
1873; SSE-NEXT:    pslld $4, %xmm0
1874; SSE-NEXT:    por %xmm1, %xmm0
1875; SSE-NEXT:    retq
1876;
1877; AVX-LABEL: splatconstant_rotate_v4i32:
1878; AVX:       # %bb.0:
1879; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
1880; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
1881; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1882; AVX-NEXT:    retq
1883;
1884; AVX512F-LABEL: splatconstant_rotate_v4i32:
1885; AVX512F:       # %bb.0:
1886; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1887; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
1888; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1889; AVX512F-NEXT:    vzeroupper
1890; AVX512F-NEXT:    retq
1891;
1892; AVX512VL-LABEL: splatconstant_rotate_v4i32:
1893; AVX512VL:       # %bb.0:
1894; AVX512VL-NEXT:    vprold $4, %xmm0, %xmm0
1895; AVX512VL-NEXT:    retq
1896;
1897; AVX512BW-LABEL: splatconstant_rotate_v4i32:
1898; AVX512BW:       # %bb.0:
1899; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1900; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
1901; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1902; AVX512BW-NEXT:    vzeroupper
1903; AVX512BW-NEXT:    retq
1904;
1905; AVX512VLBW-LABEL: splatconstant_rotate_v4i32:
1906; AVX512VLBW:       # %bb.0:
1907; AVX512VLBW-NEXT:    vprold $4, %xmm0, %xmm0
1908; AVX512VLBW-NEXT:    retq
1909;
1910; XOP-LABEL: splatconstant_rotate_v4i32:
1911; XOP:       # %bb.0:
1912; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
1913; XOP-NEXT:    retq
1914;
1915; X32-SSE-LABEL: splatconstant_rotate_v4i32:
1916; X32-SSE:       # %bb.0:
1917; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1918; X32-SSE-NEXT:    psrld $28, %xmm1
1919; X32-SSE-NEXT:    pslld $4, %xmm0
1920; X32-SSE-NEXT:    por %xmm1, %xmm0
1921; X32-SSE-NEXT:    retl
1922  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
1923  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
1924  %or = or <4 x i32> %shl, %lshr
1925  ret <4 x i32> %or
1926}
1927
1928define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
1929; SSE-LABEL: splatconstant_rotate_v8i16:
1930; SSE:       # %bb.0:
1931; SSE-NEXT:    movdqa %xmm0, %xmm1
1932; SSE-NEXT:    psrlw $9, %xmm1
1933; SSE-NEXT:    psllw $7, %xmm0
1934; SSE-NEXT:    por %xmm1, %xmm0
1935; SSE-NEXT:    retq
1936;
1937; AVX-LABEL: splatconstant_rotate_v8i16:
1938; AVX:       # %bb.0:
1939; AVX-NEXT:    vpsrlw $9, %xmm0, %xmm1
1940; AVX-NEXT:    vpsllw $7, %xmm0, %xmm0
1941; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1942; AVX-NEXT:    retq
1943;
1944; AVX512-LABEL: splatconstant_rotate_v8i16:
1945; AVX512:       # %bb.0:
1946; AVX512-NEXT:    vpsrlw $9, %xmm0, %xmm1
1947; AVX512-NEXT:    vpsllw $7, %xmm0, %xmm0
1948; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
1949; AVX512-NEXT:    retq
1950;
1951; XOP-LABEL: splatconstant_rotate_v8i16:
1952; XOP:       # %bb.0:
1953; XOP-NEXT:    vprotw $7, %xmm0, %xmm0
1954; XOP-NEXT:    retq
1955;
1956; X32-SSE-LABEL: splatconstant_rotate_v8i16:
1957; X32-SSE:       # %bb.0:
1958; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1959; X32-SSE-NEXT:    psrlw $9, %xmm1
1960; X32-SSE-NEXT:    psllw $7, %xmm0
1961; X32-SSE-NEXT:    por %xmm1, %xmm0
1962; X32-SSE-NEXT:    retl
1963  %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
1964  %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
1965  %or = or <8 x i16> %shl, %lshr
1966  ret <8 x i16> %or
1967}
1968
1969define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
1970; SSE-LABEL: splatconstant_rotate_v16i8:
1971; SSE:       # %bb.0:
1972; SSE-NEXT:    movdqa %xmm0, %xmm1
1973; SSE-NEXT:    psrlw $4, %xmm1
1974; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
1975; SSE-NEXT:    psllw $4, %xmm0
1976; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
1977; SSE-NEXT:    por %xmm1, %xmm0
1978; SSE-NEXT:    retq
1979;
1980; AVX-LABEL: splatconstant_rotate_v16i8:
1981; AVX:       # %bb.0:
1982; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
1983; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
1984; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
1985; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1986; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1987; AVX-NEXT:    retq
1988;
1989; AVX512-LABEL: splatconstant_rotate_v16i8:
1990; AVX512:       # %bb.0:
1991; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
1992; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
1993; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
1994; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
1995; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
1996; AVX512-NEXT:    retq
1997;
1998; XOP-LABEL: splatconstant_rotate_v16i8:
1999; XOP:       # %bb.0:
2000; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
2001; XOP-NEXT:    retq
2002;
2003; X32-SSE-LABEL: splatconstant_rotate_v16i8:
2004; X32-SSE:       # %bb.0:
2005; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
2006; X32-SSE-NEXT:    psrlw $4, %xmm1
2007; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
2008; X32-SSE-NEXT:    psllw $4, %xmm0
2009; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
2010; X32-SSE-NEXT:    por %xmm1, %xmm0
2011; X32-SSE-NEXT:    retl
2012  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2013  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2014  %or = or <16 x i8> %shl, %lshr
2015  ret <16 x i8> %or
2016}
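; Byte rotate by 4. There is no per-byte shift, so the 16-bit psrlw/psllw
; shifts are used and each result is pand'ed with a byte mask (in effect 0x0f
; and 0xf0) to discard the bits that crossed into the neighboring byte. XOP
; uses vprotb $4 directly; the AVX512 targets have no byte rotate, so they keep
; the same shift-and-mask form.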
2017
2018;
2019; Masked Uniform Constant Rotates
2020;
2021
2022define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
2023; SSE-LABEL: splatconstant_rotate_mask_v2i64:
2024; SSE:       # %bb.0:
2025; SSE-NEXT:    psrlq $49, %xmm0
2026; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
2027; SSE-NEXT:    retq
2028;
2029; AVX-LABEL: splatconstant_rotate_mask_v2i64:
2030; AVX:       # %bb.0:
2031; AVX-NEXT:    vpsrlq $49, %xmm0, %xmm0
2032; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2033; AVX-NEXT:    retq
2034;
2035; AVX512F-LABEL: splatconstant_rotate_mask_v2i64:
2036; AVX512F:       # %bb.0:
2037; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2038; AVX512F-NEXT:    vprolq $15, %zmm0, %zmm0
2039; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2040; AVX512F-NEXT:    vzeroupper
2041; AVX512F-NEXT:    retq
2042;
2043; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64:
2044; AVX512VL:       # %bb.0:
2045; AVX512VL-NEXT:    vprolq $15, %xmm0, %xmm0
2046; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2047; AVX512VL-NEXT:    retq
2048;
2049; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64:
2050; AVX512BW:       # %bb.0:
2051; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2052; AVX512BW-NEXT:    vprolq $15, %zmm0, %zmm0
2053; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2054; AVX512BW-NEXT:    vzeroupper
2055; AVX512BW-NEXT:    retq
2056;
2057; AVX512VLBW-LABEL: splatconstant_rotate_mask_v2i64:
2058; AVX512VLBW:       # %bb.0:
2059; AVX512VLBW-NEXT:    vprolq $15, %xmm0, %xmm0
2060; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2061; AVX512VLBW-NEXT:    retq
2062;
2063; XOP-LABEL: splatconstant_rotate_mask_v2i64:
2064; XOP:       # %bb.0:
2065; XOP-NEXT:    vprotq $15, %xmm0, %xmm0
2066; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2067; XOP-NEXT:    retq
2068;
2069; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
2070; X32-SSE:       # %bb.0:
2071; X32-SSE-NEXT:    psrlq $49, %xmm0
2072; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
2073; X32-SSE-NEXT:    retl
2074  %shl = shl <2 x i64> %a, <i64 15, i64 15>
2075  %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
2076  %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
2077  %lmask = and <2 x i64> %shl, <i64 65, i64 33>
2078  %or = or <2 x i64> %lmask, %rmask
2079  ret <2 x i64> %or
2080}
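; Masked rotate by 15 where the left-shift mask <65,33> only covers bits that
; the shl has already cleared, so (shl & lmask) is known zero and the whole
; expression folds to (a >> 49) & <255,127>: SSE and X32-SSE emit just
; psrlq $49 plus pand. The AVX512 targets keep the vprolq $15 and apply the
; remaining mask with one vpand.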
2081
2082define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
2083; SSE-LABEL: splatconstant_rotate_mask_v4i32:
2084; SSE:       # %bb.0:
2085; SSE-NEXT:    movdqa %xmm0, %xmm1
2086; SSE-NEXT:    psrld $28, %xmm1
2087; SSE-NEXT:    pslld $4, %xmm0
2088; SSE-NEXT:    por %xmm1, %xmm0
2089; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
2090; SSE-NEXT:    retq
2091;
2092; AVX-LABEL: splatconstant_rotate_mask_v4i32:
2093; AVX:       # %bb.0:
2094; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
2095; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
2096; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2097; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2098; AVX-NEXT:    retq
2099;
2100; AVX512F-LABEL: splatconstant_rotate_mask_v4i32:
2101; AVX512F:       # %bb.0:
2102; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2103; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
2104; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2105; AVX512F-NEXT:    vzeroupper
2106; AVX512F-NEXT:    retq
2107;
2108; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32:
2109; AVX512VL:       # %bb.0:
2110; AVX512VL-NEXT:    vprold $4, %xmm0, %xmm0
2111; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2112; AVX512VL-NEXT:    retq
2113;
2114; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32:
2115; AVX512BW:       # %bb.0:
2116; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2117; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
2118; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2119; AVX512BW-NEXT:    vzeroupper
2120; AVX512BW-NEXT:    retq
2121;
2122; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i32:
2123; AVX512VLBW:       # %bb.0:
2124; AVX512VLBW-NEXT:    vprold $4, %xmm0, %xmm0
2125; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2126; AVX512VLBW-NEXT:    retq
2127;
2128; XOP-LABEL: splatconstant_rotate_mask_v4i32:
2129; XOP:       # %bb.0:
2130; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
2131; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2132; XOP-NEXT:    retq
2133;
2134; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32:
2135; X32-SSE:       # %bb.0:
2136; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
2137; X32-SSE-NEXT:    psrld $28, %xmm1
2138; X32-SSE-NEXT:    pslld $4, %xmm0
2139; X32-SSE-NEXT:    por %xmm1, %xmm0
2140; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
2141; X32-SSE-NEXT:    retl
2142  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
2143  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
2144  %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
2145  %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
2146  %or = or <4 x i32> %lmask, %rmask
2147  ret <4 x i32> %or
2148}
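; Rotate by 4 followed by two different masks. Because (a << 4) and (a >> 28)
; occupy disjoint bit ranges, the lmask/rmask pair can be merged and applied as
; a single pand after the rotate, which is what every target does here.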
2149
2150define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
2151; SSE-LABEL: splatconstant_rotate_mask_v8i16:
2152; SSE:       # %bb.0:
2153; SSE-NEXT:    movdqa %xmm0, %xmm1
2154; SSE-NEXT:    psrlw $11, %xmm1
2155; SSE-NEXT:    psllw $5, %xmm0
2156; SSE-NEXT:    por %xmm1, %xmm0
2157; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
2158; SSE-NEXT:    retq
2159;
2160; AVX-LABEL: splatconstant_rotate_mask_v8i16:
2161; AVX:       # %bb.0:
2162; AVX-NEXT:    vpsrlw $11, %xmm0, %xmm1
2163; AVX-NEXT:    vpsllw $5, %xmm0, %xmm0
2164; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2165; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2166; AVX-NEXT:    retq
2167;
2168; AVX512-LABEL: splatconstant_rotate_mask_v8i16:
2169; AVX512:       # %bb.0:
2170; AVX512-NEXT:    vpsrlw $11, %xmm0, %xmm1
2171; AVX512-NEXT:    vpsllw $5, %xmm0, %xmm0
2172; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
2173; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2174; AVX512-NEXT:    retq
2175;
2176; XOP-LABEL: splatconstant_rotate_mask_v8i16:
2177; XOP:       # %bb.0:
2178; XOP-NEXT:    vprotw $5, %xmm0, %xmm0
2179; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2180; XOP-NEXT:    retq
2181;
2182; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16:
2183; X32-SSE:       # %bb.0:
2184; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
2185; X32-SSE-NEXT:    psrlw $11, %xmm1
2186; X32-SSE-NEXT:    psllw $5, %xmm0
2187; X32-SSE-NEXT:    por %xmm1, %xmm0
2188; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
2189; X32-SSE-NEXT:    retl
2190  %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
2191  %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
2192  %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
2193  %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
2194  %or = or <8 x i16> %lmask, %rmask
2195  ret <8 x i16> %or
2196}
2197
2198define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
2199; SSE-LABEL: splatconstant_rotate_mask_v16i8:
2200; SSE:       # %bb.0:
2201; SSE-NEXT:    movdqa %xmm0, %xmm1
2202; SSE-NEXT:    psrlw $4, %xmm1
2203; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
2204; SSE-NEXT:    psllw $4, %xmm0
2205; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
2206; SSE-NEXT:    por %xmm1, %xmm0
2207; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
2208; SSE-NEXT:    retq
2209;
2210; AVX-LABEL: splatconstant_rotate_mask_v16i8:
2211; AVX:       # %bb.0:
2212; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
2213; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
2214; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
2215; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2216; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2217; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2218; AVX-NEXT:    retq
2219;
2220; AVX512-LABEL: splatconstant_rotate_mask_v16i8:
2221; AVX512:       # %bb.0:
2222; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm1
2223; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
2224; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm0
2225; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2226; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
2227; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2228; AVX512-NEXT:    retq
2229;
2230; XOP-LABEL: splatconstant_rotate_mask_v16i8:
2231; XOP:       # %bb.0:
2232; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
2233; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
2234; XOP-NEXT:    retq
2235;
2236; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
2237; X32-SSE:       # %bb.0:
2238; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
2239; X32-SSE-NEXT:    psrlw $4, %xmm1
2240; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
2241; X32-SSE-NEXT:    psllw $4, %xmm0
2242; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
2243; X32-SSE-NEXT:    por %xmm1, %xmm0
2244; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
2245; X32-SSE-NEXT:    retl
2246  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2247  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
2248  %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
2249  %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
2250  %or = or <16 x i8> %lmask, %rmask
2251  ret <16 x i8> %or
2252}
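; Same idea for bytes: the rotate itself already needs the byte-lane masks
; (except on XOP's vprotb), and since the two rotate halves occupy disjoint
; bits of each byte the rmask/lmask pair again collapses into one final pand.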
2253