; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
;
; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2

;
; Variable Rotates
;
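; Each test below builds a rotate out of primitive IR, e.g. for a v2i64
; rotate-left by a variable amount:  rot = (a shl b) | (a lshr (64 - b)).
; The CHECK lines then cover how that pattern is legalized per subtarget:
; XOP matches it to a single vprotq/vprotd/vprotw/vprotb, AVX2 uses the
; per-element variable shifts (vpsllv*/vpsrlv*) where the element width
; allows, and the plain SSE2/SSE4.1 paths fall back to splitting the shifts
; per element and blending the results back together.
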
16define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
17; SSE2-LABEL: var_rotate_v2i64:
18; SSE2:       # BB#0:
19; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
20; SSE2-NEXT:    psubq %xmm1, %xmm2
21; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
22; SSE2-NEXT:    movdqa %xmm0, %xmm4
23; SSE2-NEXT:    psllq %xmm3, %xmm4
24; SSE2-NEXT:    movdqa %xmm0, %xmm3
25; SSE2-NEXT:    psllq %xmm1, %xmm3
26; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
27; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
28; SSE2-NEXT:    movdqa %xmm0, %xmm1
29; SSE2-NEXT:    psrlq %xmm3, %xmm1
30; SSE2-NEXT:    psrlq %xmm2, %xmm0
31; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
32; SSE2-NEXT:    orpd %xmm4, %xmm1
33; SSE2-NEXT:    movapd %xmm1, %xmm0
34; SSE2-NEXT:    retq
35;
36; SSE41-LABEL: var_rotate_v2i64:
37; SSE41:       # BB#0:
38; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
39; SSE41-NEXT:    psubq %xmm1, %xmm2
40; SSE41-NEXT:    movdqa %xmm0, %xmm3
41; SSE41-NEXT:    psllq %xmm1, %xmm3
42; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
43; SSE41-NEXT:    movdqa %xmm0, %xmm4
44; SSE41-NEXT:    psllq %xmm1, %xmm4
45; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
46; SSE41-NEXT:    movdqa %xmm0, %xmm1
47; SSE41-NEXT:    psrlq %xmm2, %xmm1
48; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
49; SSE41-NEXT:    psrlq %xmm2, %xmm0
50; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
51; SSE41-NEXT:    por %xmm4, %xmm0
52; SSE41-NEXT:    retq
53;
54; AVX1-LABEL: var_rotate_v2i64:
55; AVX1:       # BB#0:
56; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
57; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
58; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm3
59; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
60; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
61; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
62; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
63; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
64; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
65; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
66; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
67; AVX1-NEXT:    retq
68;
69; AVX2-LABEL: var_rotate_v2i64:
70; AVX2:       # BB#0:
71; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
72; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
73; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm1
74; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
75; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
76; AVX2-NEXT:    retq
77;
78; XOP-LABEL: var_rotate_v2i64:
79; XOP:       # BB#0:
80; XOP-NEXT:    vprotq %xmm1, %xmm0, %xmm0
81; XOP-NEXT:    retq
82;
83; X32-SSE-LABEL: var_rotate_v2i64:
84; X32-SSE:       # BB#0:
85; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64,0,64,0]
86; X32-SSE-NEXT:    psubq %xmm1, %xmm2
87; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
88; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
89; X32-SSE-NEXT:    psllq %xmm3, %xmm4
90; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
91; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
92; X32-SSE-NEXT:    psllq %xmm1, %xmm3
93; X32-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
94; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
95; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
96; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
97; X32-SSE-NEXT:    movq {{.*#+}} xmm2 = xmm2[0],zero
98; X32-SSE-NEXT:    psrlq %xmm2, %xmm0
99; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
100; X32-SSE-NEXT:    orpd %xmm4, %xmm1
101; X32-SSE-NEXT:    movapd %xmm1, %xmm0
102; X32-SSE-NEXT:    retl
103  %b64 = sub <2 x i64> <i64 64, i64 64>, %b
104  %shl = shl <2 x i64> %a, %b
105  %lshr = lshr <2 x i64> %a, %b64
106  %or = or <2 x i64> %shl, %lshr
107  ret <2 x i64> %or
108}
109
110define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
111; SSE2-LABEL: var_rotate_v4i32:
112; SSE2:       # BB#0:
113; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
114; SSE2-NEXT:    psubd %xmm1, %xmm2
115; SSE2-NEXT:    pslld $23, %xmm1
116; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
117; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
118; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
119; SSE2-NEXT:    pmuludq %xmm0, %xmm1
120; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
121; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
122; SSE2-NEXT:    pmuludq %xmm3, %xmm4
123; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
124; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
125; SSE2-NEXT:    movdqa %xmm2, %xmm3
126; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
127; SSE2-NEXT:    movdqa %xmm0, %xmm4
128; SSE2-NEXT:    psrld %xmm3, %xmm4
129; SSE2-NEXT:    movdqa %xmm2, %xmm3
130; SSE2-NEXT:    psrlq $32, %xmm3
131; SSE2-NEXT:    movdqa %xmm0, %xmm5
132; SSE2-NEXT:    psrld %xmm3, %xmm5
133; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
134; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
135; SSE2-NEXT:    pxor %xmm4, %xmm4
136; SSE2-NEXT:    movdqa %xmm2, %xmm5
137; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
138; SSE2-NEXT:    movdqa %xmm0, %xmm6
139; SSE2-NEXT:    psrld %xmm5, %xmm6
140; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
141; SSE2-NEXT:    psrld %xmm2, %xmm0
142; SSE2-NEXT:    movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
143; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
144; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
145; SSE2-NEXT:    por %xmm1, %xmm0
146; SSE2-NEXT:    retq
147;
148; SSE41-LABEL: var_rotate_v4i32:
149; SSE41:       # BB#0:
150; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
151; SSE41-NEXT:    psubd %xmm1, %xmm2
152; SSE41-NEXT:    pslld $23, %xmm1
153; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
154; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
155; SSE41-NEXT:    pmulld %xmm0, %xmm1
156; SSE41-NEXT:    movdqa %xmm2, %xmm3
157; SSE41-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
158; SSE41-NEXT:    movdqa %xmm0, %xmm4
159; SSE41-NEXT:    psrld %xmm3, %xmm4
160; SSE41-NEXT:    movdqa %xmm2, %xmm3
161; SSE41-NEXT:    psrlq $32, %xmm3
162; SSE41-NEXT:    movdqa %xmm0, %xmm5
163; SSE41-NEXT:    psrld %xmm3, %xmm5
164; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
165; SSE41-NEXT:    pxor %xmm3, %xmm3
166; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
167; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
168; SSE41-NEXT:    movdqa %xmm0, %xmm3
169; SSE41-NEXT:    psrld %xmm2, %xmm3
170; SSE41-NEXT:    psrld %xmm4, %xmm0
171; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
172; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
173; SSE41-NEXT:    por %xmm1, %xmm0
174; SSE41-NEXT:    retq
175;
176; AVX1-LABEL: var_rotate_v4i32:
177; AVX1:       # BB#0:
178; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
179; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
180; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
181; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
182; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
183; AVX1-NEXT:    vpmulld %xmm0, %xmm1, %xmm1
184; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
185; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
186; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
187; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
188; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
189; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
190; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
191; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
192; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
193; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
194; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
195; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
196; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
197; AVX1-NEXT:    retq
198;
199; AVX2-LABEL: var_rotate_v4i32:
200; AVX2:       # BB#0:
201; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
202; AVX2-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
203; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm1
204; AVX2-NEXT:    vpsrlvd %xmm2, %xmm0, %xmm0
205; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
206; AVX2-NEXT:    retq
207;
208; XOP-LABEL: var_rotate_v4i32:
209; XOP:       # BB#0:
210; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
211; XOP-NEXT:    retq
212;
213; X32-SSE-LABEL: var_rotate_v4i32:
214; X32-SSE:       # BB#0:
215; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
216; X32-SSE-NEXT:    psubd %xmm1, %xmm2
217; X32-SSE-NEXT:    pslld $23, %xmm1
218; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm1
219; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
220; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
221; X32-SSE-NEXT:    pmuludq %xmm0, %xmm1
222; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
223; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
224; X32-SSE-NEXT:    pmuludq %xmm3, %xmm4
225; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
226; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
227; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
228; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
229; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
230; X32-SSE-NEXT:    psrld %xmm3, %xmm4
231; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
232; X32-SSE-NEXT:    psrlq $32, %xmm3
233; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
234; X32-SSE-NEXT:    psrld %xmm3, %xmm5
235; X32-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
236; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
237; X32-SSE-NEXT:    pxor %xmm4, %xmm4
238; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
239; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
240; X32-SSE-NEXT:    movdqa %xmm0, %xmm6
241; X32-SSE-NEXT:    psrld %xmm5, %xmm6
242; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
243; X32-SSE-NEXT:    psrld %xmm2, %xmm0
244; X32-SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
245; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
246; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
247; X32-SSE-NEXT:    por %xmm1, %xmm0
248; X32-SSE-NEXT:    retl
249  %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b
250  %shl = shl <4 x i32> %a, %b
251  %lshr = lshr <4 x i32> %a, %b32
252  %or = or <4 x i32> %shl, %lshr
253  ret <4 x i32> %or
254}
255
256define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
257; SSE2-LABEL: var_rotate_v8i16:
258; SSE2:       # BB#0:
259; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
260; SSE2-NEXT:    psubw %xmm1, %xmm3
261; SSE2-NEXT:    psllw $12, %xmm1
262; SSE2-NEXT:    movdqa %xmm1, %xmm2
263; SSE2-NEXT:    psraw $15, %xmm2
264; SSE2-NEXT:    movdqa %xmm0, %xmm4
265; SSE2-NEXT:    psllw $8, %xmm4
266; SSE2-NEXT:    pand %xmm2, %xmm4
267; SSE2-NEXT:    pandn %xmm0, %xmm2
268; SSE2-NEXT:    por %xmm4, %xmm2
269; SSE2-NEXT:    paddw %xmm1, %xmm1
270; SSE2-NEXT:    movdqa %xmm1, %xmm4
271; SSE2-NEXT:    psraw $15, %xmm4
272; SSE2-NEXT:    movdqa %xmm4, %xmm5
273; SSE2-NEXT:    pandn %xmm2, %xmm5
274; SSE2-NEXT:    psllw $4, %xmm2
275; SSE2-NEXT:    pand %xmm4, %xmm2
276; SSE2-NEXT:    por %xmm5, %xmm2
277; SSE2-NEXT:    paddw %xmm1, %xmm1
278; SSE2-NEXT:    movdqa %xmm1, %xmm4
279; SSE2-NEXT:    psraw $15, %xmm4
280; SSE2-NEXT:    movdqa %xmm4, %xmm5
281; SSE2-NEXT:    pandn %xmm2, %xmm5
282; SSE2-NEXT:    psllw $2, %xmm2
283; SSE2-NEXT:    pand %xmm4, %xmm2
284; SSE2-NEXT:    por %xmm5, %xmm2
285; SSE2-NEXT:    paddw %xmm1, %xmm1
286; SSE2-NEXT:    psraw $15, %xmm1
287; SSE2-NEXT:    movdqa %xmm1, %xmm4
288; SSE2-NEXT:    pandn %xmm2, %xmm4
289; SSE2-NEXT:    psllw $1, %xmm2
290; SSE2-NEXT:    pand %xmm1, %xmm2
291; SSE2-NEXT:    psllw $12, %xmm3
292; SSE2-NEXT:    movdqa %xmm3, %xmm1
293; SSE2-NEXT:    psraw $15, %xmm1
294; SSE2-NEXT:    movdqa %xmm1, %xmm5
295; SSE2-NEXT:    pandn %xmm0, %xmm5
296; SSE2-NEXT:    psrlw $8, %xmm0
297; SSE2-NEXT:    pand %xmm1, %xmm0
298; SSE2-NEXT:    por %xmm5, %xmm0
299; SSE2-NEXT:    paddw %xmm3, %xmm3
300; SSE2-NEXT:    movdqa %xmm3, %xmm1
301; SSE2-NEXT:    psraw $15, %xmm1
302; SSE2-NEXT:    movdqa %xmm1, %xmm5
303; SSE2-NEXT:    pandn %xmm0, %xmm5
304; SSE2-NEXT:    psrlw $4, %xmm0
305; SSE2-NEXT:    pand %xmm1, %xmm0
306; SSE2-NEXT:    por %xmm5, %xmm0
307; SSE2-NEXT:    paddw %xmm3, %xmm3
308; SSE2-NEXT:    movdqa %xmm3, %xmm1
309; SSE2-NEXT:    psraw $15, %xmm1
310; SSE2-NEXT:    movdqa %xmm1, %xmm5
311; SSE2-NEXT:    pandn %xmm0, %xmm5
312; SSE2-NEXT:    psrlw $2, %xmm0
313; SSE2-NEXT:    pand %xmm1, %xmm0
314; SSE2-NEXT:    por %xmm5, %xmm0
315; SSE2-NEXT:    paddw %xmm3, %xmm3
316; SSE2-NEXT:    psraw $15, %xmm3
317; SSE2-NEXT:    movdqa %xmm3, %xmm1
318; SSE2-NEXT:    pandn %xmm0, %xmm1
319; SSE2-NEXT:    psrlw $1, %xmm0
320; SSE2-NEXT:    pand %xmm3, %xmm0
321; SSE2-NEXT:    por %xmm1, %xmm0
322; SSE2-NEXT:    por %xmm4, %xmm0
323; SSE2-NEXT:    por %xmm2, %xmm0
324; SSE2-NEXT:    retq
325;
326; SSE41-LABEL: var_rotate_v8i16:
327; SSE41:       # BB#0:
328; SSE41-NEXT:    movdqa %xmm0, %xmm3
329; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
330; SSE41-NEXT:    psubw %xmm1, %xmm2
331; SSE41-NEXT:    movdqa %xmm1, %xmm0
332; SSE41-NEXT:    psllw $12, %xmm0
333; SSE41-NEXT:    psllw $4, %xmm1
334; SSE41-NEXT:    por %xmm0, %xmm1
335; SSE41-NEXT:    movdqa %xmm1, %xmm4
336; SSE41-NEXT:    paddw %xmm4, %xmm4
337; SSE41-NEXT:    movdqa %xmm3, %xmm6
338; SSE41-NEXT:    psllw $8, %xmm6
339; SSE41-NEXT:    movdqa %xmm3, %xmm5
340; SSE41-NEXT:    movdqa %xmm1, %xmm0
341; SSE41-NEXT:    pblendvb %xmm6, %xmm5
342; SSE41-NEXT:    movdqa %xmm5, %xmm1
343; SSE41-NEXT:    psllw $4, %xmm1
344; SSE41-NEXT:    movdqa %xmm4, %xmm0
345; SSE41-NEXT:    pblendvb %xmm1, %xmm5
346; SSE41-NEXT:    movdqa %xmm5, %xmm1
347; SSE41-NEXT:    psllw $2, %xmm1
348; SSE41-NEXT:    paddw %xmm4, %xmm4
349; SSE41-NEXT:    movdqa %xmm4, %xmm0
350; SSE41-NEXT:    pblendvb %xmm1, %xmm5
351; SSE41-NEXT:    movdqa %xmm5, %xmm1
352; SSE41-NEXT:    psllw $1, %xmm1
353; SSE41-NEXT:    paddw %xmm4, %xmm4
354; SSE41-NEXT:    movdqa %xmm4, %xmm0
355; SSE41-NEXT:    pblendvb %xmm1, %xmm5
356; SSE41-NEXT:    movdqa %xmm2, %xmm0
357; SSE41-NEXT:    psllw $12, %xmm0
358; SSE41-NEXT:    psllw $4, %xmm2
359; SSE41-NEXT:    por %xmm0, %xmm2
360; SSE41-NEXT:    movdqa %xmm2, %xmm1
361; SSE41-NEXT:    paddw %xmm1, %xmm1
362; SSE41-NEXT:    movdqa %xmm3, %xmm4
363; SSE41-NEXT:    psrlw $8, %xmm4
364; SSE41-NEXT:    movdqa %xmm2, %xmm0
365; SSE41-NEXT:    pblendvb %xmm4, %xmm3
366; SSE41-NEXT:    movdqa %xmm3, %xmm2
367; SSE41-NEXT:    psrlw $4, %xmm2
368; SSE41-NEXT:    movdqa %xmm1, %xmm0
369; SSE41-NEXT:    pblendvb %xmm2, %xmm3
370; SSE41-NEXT:    movdqa %xmm3, %xmm2
371; SSE41-NEXT:    psrlw $2, %xmm2
372; SSE41-NEXT:    paddw %xmm1, %xmm1
373; SSE41-NEXT:    movdqa %xmm1, %xmm0
374; SSE41-NEXT:    pblendvb %xmm2, %xmm3
375; SSE41-NEXT:    movdqa %xmm3, %xmm2
376; SSE41-NEXT:    psrlw $1, %xmm2
377; SSE41-NEXT:    paddw %xmm1, %xmm1
378; SSE41-NEXT:    movdqa %xmm1, %xmm0
379; SSE41-NEXT:    pblendvb %xmm2, %xmm3
380; SSE41-NEXT:    por %xmm5, %xmm3
381; SSE41-NEXT:    movdqa %xmm3, %xmm0
382; SSE41-NEXT:    retq
383;
384; AVX1-LABEL: var_rotate_v8i16:
385; AVX1:       # BB#0:
386; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
387; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
388; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
389; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
390; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
391; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
392; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm4
393; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm1
394; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm4
395; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
396; AVX1-NEXT:    vpsllw $2, %xmm1, %xmm4
397; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
398; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
399; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm4
400; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
401; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
402; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
403; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
404; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
405; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
406; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
407; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
408; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
409; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
410; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
411; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
412; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
413; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
414; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
415; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
416; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
417; AVX1-NEXT:    retq
418;
419; AVX2-LABEL: var_rotate_v8i16:
420; AVX2:       # BB#0:
421; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
422; AVX2-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
423; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
424; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
425; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm1
426; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
427; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
428; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
429; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
430; AVX2-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm0
431; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
432; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
433; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
434; AVX2-NEXT:    vzeroupper
435; AVX2-NEXT:    retq
436;
437; XOP-LABEL: var_rotate_v8i16:
438; XOP:       # BB#0:
439; XOP-NEXT:    vprotw %xmm1, %xmm0, %xmm0
440; XOP-NEXT:    retq
441;
442; X32-SSE-LABEL: var_rotate_v8i16:
443; X32-SSE:       # BB#0:
444; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
445; X32-SSE-NEXT:    psubw %xmm1, %xmm3
446; X32-SSE-NEXT:    psllw $12, %xmm1
447; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
448; X32-SSE-NEXT:    psraw $15, %xmm2
449; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
450; X32-SSE-NEXT:    psllw $8, %xmm4
451; X32-SSE-NEXT:    pand %xmm2, %xmm4
452; X32-SSE-NEXT:    pandn %xmm0, %xmm2
453; X32-SSE-NEXT:    por %xmm4, %xmm2
454; X32-SSE-NEXT:    paddw %xmm1, %xmm1
455; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
456; X32-SSE-NEXT:    psraw $15, %xmm4
457; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
458; X32-SSE-NEXT:    pandn %xmm2, %xmm5
459; X32-SSE-NEXT:    psllw $4, %xmm2
460; X32-SSE-NEXT:    pand %xmm4, %xmm2
461; X32-SSE-NEXT:    por %xmm5, %xmm2
462; X32-SSE-NEXT:    paddw %xmm1, %xmm1
463; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
464; X32-SSE-NEXT:    psraw $15, %xmm4
465; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
466; X32-SSE-NEXT:    pandn %xmm2, %xmm5
467; X32-SSE-NEXT:    psllw $2, %xmm2
468; X32-SSE-NEXT:    pand %xmm4, %xmm2
469; X32-SSE-NEXT:    por %xmm5, %xmm2
470; X32-SSE-NEXT:    paddw %xmm1, %xmm1
471; X32-SSE-NEXT:    psraw $15, %xmm1
472; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
473; X32-SSE-NEXT:    pandn %xmm2, %xmm4
474; X32-SSE-NEXT:    psllw $1, %xmm2
475; X32-SSE-NEXT:    pand %xmm1, %xmm2
476; X32-SSE-NEXT:    psllw $12, %xmm3
477; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
478; X32-SSE-NEXT:    psraw $15, %xmm1
479; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
480; X32-SSE-NEXT:    pandn %xmm0, %xmm5
481; X32-SSE-NEXT:    psrlw $8, %xmm0
482; X32-SSE-NEXT:    pand %xmm1, %xmm0
483; X32-SSE-NEXT:    por %xmm5, %xmm0
484; X32-SSE-NEXT:    paddw %xmm3, %xmm3
485; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
486; X32-SSE-NEXT:    psraw $15, %xmm1
487; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
488; X32-SSE-NEXT:    pandn %xmm0, %xmm5
489; X32-SSE-NEXT:    psrlw $4, %xmm0
490; X32-SSE-NEXT:    pand %xmm1, %xmm0
491; X32-SSE-NEXT:    por %xmm5, %xmm0
492; X32-SSE-NEXT:    paddw %xmm3, %xmm3
493; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
494; X32-SSE-NEXT:    psraw $15, %xmm1
495; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
496; X32-SSE-NEXT:    pandn %xmm0, %xmm5
497; X32-SSE-NEXT:    psrlw $2, %xmm0
498; X32-SSE-NEXT:    pand %xmm1, %xmm0
499; X32-SSE-NEXT:    por %xmm5, %xmm0
500; X32-SSE-NEXT:    paddw %xmm3, %xmm3
501; X32-SSE-NEXT:    psraw $15, %xmm3
502; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
503; X32-SSE-NEXT:    pandn %xmm0, %xmm1
504; X32-SSE-NEXT:    psrlw $1, %xmm0
505; X32-SSE-NEXT:    pand %xmm3, %xmm0
506; X32-SSE-NEXT:    por %xmm1, %xmm0
507; X32-SSE-NEXT:    por %xmm4, %xmm0
508; X32-SSE-NEXT:    por %xmm2, %xmm0
509; X32-SSE-NEXT:    retl
510  %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
511  %shl = shl <8 x i16> %a, %b
512  %lshr = lshr <8 x i16> %a, %b16
513  %or = or <8 x i16> %shl, %lshr
514  ret <8 x i16> %or
515}
516
517define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
518; SSE2-LABEL: var_rotate_v16i8:
519; SSE2:       # BB#0:
520; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
521; SSE2-NEXT:    psubb %xmm1, %xmm4
522; SSE2-NEXT:    psllw $5, %xmm1
523; SSE2-NEXT:    pxor %xmm3, %xmm3
524; SSE2-NEXT:    pxor %xmm2, %xmm2
525; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
526; SSE2-NEXT:    movdqa %xmm0, %xmm5
527; SSE2-NEXT:    psllw $4, %xmm5
528; SSE2-NEXT:    pand {{.*}}(%rip), %xmm5
529; SSE2-NEXT:    pand %xmm2, %xmm5
530; SSE2-NEXT:    pandn %xmm0, %xmm2
531; SSE2-NEXT:    por %xmm5, %xmm2
532; SSE2-NEXT:    paddb %xmm1, %xmm1
533; SSE2-NEXT:    pxor %xmm5, %xmm5
534; SSE2-NEXT:    pcmpgtb %xmm1, %xmm5
535; SSE2-NEXT:    movdqa %xmm5, %xmm6
536; SSE2-NEXT:    pandn %xmm2, %xmm6
537; SSE2-NEXT:    psllw $2, %xmm2
538; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
539; SSE2-NEXT:    pand %xmm5, %xmm2
540; SSE2-NEXT:    por %xmm6, %xmm2
541; SSE2-NEXT:    paddb %xmm1, %xmm1
542; SSE2-NEXT:    pxor %xmm5, %xmm5
543; SSE2-NEXT:    pcmpgtb %xmm1, %xmm5
544; SSE2-NEXT:    movdqa %xmm5, %xmm1
545; SSE2-NEXT:    pandn %xmm2, %xmm1
546; SSE2-NEXT:    paddb %xmm2, %xmm2
547; SSE2-NEXT:    pand %xmm5, %xmm2
548; SSE2-NEXT:    psllw $5, %xmm4
549; SSE2-NEXT:    pxor %xmm5, %xmm5
550; SSE2-NEXT:    pcmpgtb %xmm4, %xmm5
551; SSE2-NEXT:    movdqa %xmm5, %xmm6
552; SSE2-NEXT:    pandn %xmm0, %xmm6
553; SSE2-NEXT:    psrlw $4, %xmm0
554; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
555; SSE2-NEXT:    pand %xmm5, %xmm0
556; SSE2-NEXT:    por %xmm6, %xmm0
557; SSE2-NEXT:    paddb %xmm4, %xmm4
558; SSE2-NEXT:    pxor %xmm5, %xmm5
559; SSE2-NEXT:    pcmpgtb %xmm4, %xmm5
560; SSE2-NEXT:    movdqa %xmm5, %xmm6
561; SSE2-NEXT:    pandn %xmm0, %xmm6
562; SSE2-NEXT:    psrlw $2, %xmm0
563; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
564; SSE2-NEXT:    pand %xmm5, %xmm0
565; SSE2-NEXT:    por %xmm6, %xmm0
566; SSE2-NEXT:    paddb %xmm4, %xmm4
567; SSE2-NEXT:    pcmpgtb %xmm4, %xmm3
568; SSE2-NEXT:    movdqa %xmm3, %xmm4
569; SSE2-NEXT:    pandn %xmm0, %xmm4
570; SSE2-NEXT:    psrlw $1, %xmm0
571; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
572; SSE2-NEXT:    pand %xmm3, %xmm0
573; SSE2-NEXT:    por %xmm4, %xmm0
574; SSE2-NEXT:    por %xmm1, %xmm0
575; SSE2-NEXT:    por %xmm2, %xmm0
576; SSE2-NEXT:    retq
577;
578; SSE41-LABEL: var_rotate_v16i8:
579; SSE41:       # BB#0:
580; SSE41-NEXT:    movdqa %xmm1, %xmm3
581; SSE41-NEXT:    movdqa %xmm0, %xmm1
582; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
583; SSE41-NEXT:    psubb %xmm3, %xmm2
584; SSE41-NEXT:    psllw $5, %xmm3
585; SSE41-NEXT:    movdqa %xmm1, %xmm5
586; SSE41-NEXT:    psllw $4, %xmm5
587; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
588; SSE41-NEXT:    movdqa %xmm1, %xmm4
589; SSE41-NEXT:    movdqa %xmm3, %xmm0
590; SSE41-NEXT:    pblendvb %xmm5, %xmm4
591; SSE41-NEXT:    movdqa %xmm4, %xmm5
592; SSE41-NEXT:    psllw $2, %xmm5
593; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
594; SSE41-NEXT:    paddb %xmm3, %xmm3
595; SSE41-NEXT:    movdqa %xmm3, %xmm0
596; SSE41-NEXT:    pblendvb %xmm5, %xmm4
597; SSE41-NEXT:    movdqa %xmm4, %xmm5
598; SSE41-NEXT:    paddb %xmm5, %xmm5
599; SSE41-NEXT:    paddb %xmm3, %xmm3
600; SSE41-NEXT:    movdqa %xmm3, %xmm0
601; SSE41-NEXT:    pblendvb %xmm5, %xmm4
602; SSE41-NEXT:    psllw $5, %xmm2
603; SSE41-NEXT:    movdqa %xmm2, %xmm3
604; SSE41-NEXT:    paddb %xmm3, %xmm3
605; SSE41-NEXT:    movdqa %xmm1, %xmm5
606; SSE41-NEXT:    psrlw $4, %xmm5
607; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
608; SSE41-NEXT:    movdqa %xmm2, %xmm0
609; SSE41-NEXT:    pblendvb %xmm5, %xmm1
610; SSE41-NEXT:    movdqa %xmm1, %xmm2
611; SSE41-NEXT:    psrlw $2, %xmm2
612; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
613; SSE41-NEXT:    movdqa %xmm3, %xmm0
614; SSE41-NEXT:    pblendvb %xmm2, %xmm1
615; SSE41-NEXT:    movdqa %xmm1, %xmm2
616; SSE41-NEXT:    psrlw $1, %xmm2
617; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
618; SSE41-NEXT:    paddb %xmm3, %xmm3
619; SSE41-NEXT:    movdqa %xmm3, %xmm0
620; SSE41-NEXT:    pblendvb %xmm2, %xmm1
621; SSE41-NEXT:    por %xmm4, %xmm1
622; SSE41-NEXT:    movdqa %xmm1, %xmm0
623; SSE41-NEXT:    retq
624;
625; AVX-LABEL: var_rotate_v16i8:
626; AVX:       # BB#0:
627; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
628; AVX-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
629; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
630; AVX-NEXT:    vpsllw $4, %xmm0, %xmm3
631; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
632; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm3
633; AVX-NEXT:    vpsllw $2, %xmm3, %xmm4
634; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
635; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
636; AVX-NEXT:    vpblendvb %xmm1, %xmm4, %xmm3, %xmm3
637; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm4
638; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
639; AVX-NEXT:    vpblendvb %xmm1, %xmm4, %xmm3, %xmm1
640; AVX-NEXT:    vpsllw $5, %xmm2, %xmm2
641; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
642; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
643; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
644; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
645; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
646; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
647; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
648; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
649; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
650; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
651; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
652; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
653; AVX-NEXT:    retq
654;
655; XOP-LABEL: var_rotate_v16i8:
656; XOP:       # BB#0:
657; XOP-NEXT:    vprotb %xmm1, %xmm0, %xmm0
658; XOP-NEXT:    retq
659;
660; X32-SSE-LABEL: var_rotate_v16i8:
661; X32-SSE:       # BB#0:
662; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
663; X32-SSE-NEXT:    psubb %xmm1, %xmm4
664; X32-SSE-NEXT:    psllw $5, %xmm1
665; X32-SSE-NEXT:    pxor %xmm3, %xmm3
666; X32-SSE-NEXT:    pxor %xmm2, %xmm2
667; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
668; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
669; X32-SSE-NEXT:    psllw $4, %xmm5
670; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm5
671; X32-SSE-NEXT:    pand %xmm2, %xmm5
672; X32-SSE-NEXT:    pandn %xmm0, %xmm2
673; X32-SSE-NEXT:    por %xmm5, %xmm2
674; X32-SSE-NEXT:    paddb %xmm1, %xmm1
675; X32-SSE-NEXT:    pxor %xmm5, %xmm5
676; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm5
677; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
678; X32-SSE-NEXT:    pandn %xmm2, %xmm6
679; X32-SSE-NEXT:    psllw $2, %xmm2
680; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm2
681; X32-SSE-NEXT:    pand %xmm5, %xmm2
682; X32-SSE-NEXT:    por %xmm6, %xmm2
683; X32-SSE-NEXT:    paddb %xmm1, %xmm1
684; X32-SSE-NEXT:    pxor %xmm5, %xmm5
685; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm5
686; X32-SSE-NEXT:    movdqa %xmm5, %xmm1
687; X32-SSE-NEXT:    pandn %xmm2, %xmm1
688; X32-SSE-NEXT:    paddb %xmm2, %xmm2
689; X32-SSE-NEXT:    pand %xmm5, %xmm2
690; X32-SSE-NEXT:    psllw $5, %xmm4
691; X32-SSE-NEXT:    pxor %xmm5, %xmm5
692; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
693; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
694; X32-SSE-NEXT:    pandn %xmm0, %xmm6
695; X32-SSE-NEXT:    psrlw $4, %xmm0
696; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
697; X32-SSE-NEXT:    pand %xmm5, %xmm0
698; X32-SSE-NEXT:    por %xmm6, %xmm0
699; X32-SSE-NEXT:    paddb %xmm4, %xmm4
700; X32-SSE-NEXT:    pxor %xmm5, %xmm5
701; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
702; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
703; X32-SSE-NEXT:    pandn %xmm0, %xmm6
704; X32-SSE-NEXT:    psrlw $2, %xmm0
705; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
706; X32-SSE-NEXT:    pand %xmm5, %xmm0
707; X32-SSE-NEXT:    por %xmm6, %xmm0
708; X32-SSE-NEXT:    paddb %xmm4, %xmm4
709; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm3
710; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
711; X32-SSE-NEXT:    pandn %xmm0, %xmm4
712; X32-SSE-NEXT:    psrlw $1, %xmm0
713; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
714; X32-SSE-NEXT:    pand %xmm3, %xmm0
715; X32-SSE-NEXT:    por %xmm4, %xmm0
716; X32-SSE-NEXT:    por %xmm1, %xmm0
717; X32-SSE-NEXT:    por %xmm2, %xmm0
718; X32-SSE-NEXT:    retl
719  %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
720  %shl = shl <16 x i8> %a, %b
721  %lshr = lshr <16 x i8> %a, %b8
722  %or = or <16 x i8> %shl, %lshr
723  ret <16 x i8> %or
724}
725
726;
727; Constant Rotates
728;
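; The tests below rotate each element by a different constant amount. For the
; i16/i32 cases the left-shift half becomes a multiply by a splat of powers of
; two (pmullw/pmulld/pmuludq), and the right-shift half a series of immediate
; shifts plus blends; AVX2 keeps the variable-shift form (vpsllv*/vpsrlv*) and
; XOP uses vpshl* with constant shift vectors.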
729
730define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
731; SSE2-LABEL: constant_rotate_v2i64:
732; SSE2:       # BB#0:
733; SSE2-NEXT:    movdqa %xmm0, %xmm2
734; SSE2-NEXT:    psllq $14, %xmm2
735; SSE2-NEXT:    movdqa %xmm0, %xmm1
736; SSE2-NEXT:    psllq $4, %xmm1
737; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
738; SSE2-NEXT:    movdqa %xmm0, %xmm1
739; SSE2-NEXT:    psrlq $50, %xmm1
740; SSE2-NEXT:    psrlq $60, %xmm0
741; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
742; SSE2-NEXT:    orpd %xmm2, %xmm1
743; SSE2-NEXT:    movapd %xmm1, %xmm0
744; SSE2-NEXT:    retq
745;
746; SSE41-LABEL: constant_rotate_v2i64:
747; SSE41:       # BB#0:
748; SSE41-NEXT:    movdqa %xmm0, %xmm1
749; SSE41-NEXT:    psllq $14, %xmm1
750; SSE41-NEXT:    movdqa %xmm0, %xmm2
751; SSE41-NEXT:    psllq $4, %xmm2
752; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
753; SSE41-NEXT:    movdqa %xmm0, %xmm1
754; SSE41-NEXT:    psrlq $50, %xmm1
755; SSE41-NEXT:    psrlq $60, %xmm0
756; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
757; SSE41-NEXT:    por %xmm2, %xmm0
758; SSE41-NEXT:    retq
759;
760; AVX1-LABEL: constant_rotate_v2i64:
761; AVX1:       # BB#0:
762; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm1
763; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm2
764; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
765; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm2
766; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
767; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
768; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
769; AVX1-NEXT:    retq
770;
771; AVX2-LABEL: constant_rotate_v2i64:
772; AVX2:       # BB#0:
773; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm1
774; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
775; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
776; AVX2-NEXT:    retq
777;
778; XOPAVX1-LABEL: constant_rotate_v2i64:
779; XOPAVX1:       # BB#0:
780; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm1
781; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
782; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm2
783; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm0, %xmm0
784; XOPAVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
785; XOPAVX1-NEXT:    retq
786;
787; XOPAVX2-LABEL: constant_rotate_v2i64:
788; XOPAVX2:       # BB#0:
789; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm1
790; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
791; XOPAVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
792; XOPAVX2-NEXT:    retq
793;
794; X32-SSE-LABEL: constant_rotate_v2i64:
795; X32-SSE:       # BB#0:
796; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
797; X32-SSE-NEXT:    psllq $14, %xmm2
798; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
799; X32-SSE-NEXT:    psllq $4, %xmm1
800; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
801; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
802; X32-SSE-NEXT:    psrlq $50, %xmm1
803; X32-SSE-NEXT:    psrlq $60, %xmm0
804; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
805; X32-SSE-NEXT:    orpd %xmm2, %xmm1
806; X32-SSE-NEXT:    movapd %xmm1, %xmm0
807; X32-SSE-NEXT:    retl
808  %shl = shl <2 x i64> %a, <i64 4, i64 14>
809  %lshr = lshr <2 x i64> %a, <i64 60, i64 50>
810  %or = or <2 x i64> %shl, %lshr
811  ret <2 x i64> %or
812}
813
814define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
815; SSE2-LABEL: constant_rotate_v4i32:
816; SSE2:       # BB#0:
817; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
818; SSE2-NEXT:    movdqa %xmm0, %xmm2
819; SSE2-NEXT:    pmuludq %xmm1, %xmm2
820; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
821; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
822; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
823; SSE2-NEXT:    pmuludq %xmm1, %xmm3
824; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
825; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
826; SSE2-NEXT:    movdqa %xmm0, %xmm1
827; SSE2-NEXT:    psrld $25, %xmm1
828; SSE2-NEXT:    movdqa %xmm0, %xmm3
829; SSE2-NEXT:    psrld $27, %xmm3
830; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
831; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
832; SSE2-NEXT:    movdqa %xmm0, %xmm3
833; SSE2-NEXT:    psrld $26, %xmm3
834; SSE2-NEXT:    psrld $28, %xmm0
835; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
836; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
837; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
838; SSE2-NEXT:    por %xmm2, %xmm0
839; SSE2-NEXT:    retq
840;
841; SSE41-LABEL: constant_rotate_v4i32:
842; SSE41:       # BB#0:
843; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
844; SSE41-NEXT:    pmulld %xmm0, %xmm1
845; SSE41-NEXT:    movdqa %xmm0, %xmm2
846; SSE41-NEXT:    psrld $25, %xmm2
847; SSE41-NEXT:    movdqa %xmm0, %xmm3
848; SSE41-NEXT:    psrld $27, %xmm3
849; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
850; SSE41-NEXT:    movdqa %xmm0, %xmm2
851; SSE41-NEXT:    psrld $26, %xmm2
852; SSE41-NEXT:    psrld $28, %xmm0
853; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
854; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
855; SSE41-NEXT:    por %xmm1, %xmm0
856; SSE41-NEXT:    retq
857;
858; AVX1-LABEL: constant_rotate_v4i32:
859; AVX1:       # BB#0:
860; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm1
861; AVX1-NEXT:    vpsrld $25, %xmm0, %xmm2
862; AVX1-NEXT:    vpsrld $27, %xmm0, %xmm3
863; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
864; AVX1-NEXT:    vpsrld $26, %xmm0, %xmm3
865; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm0
866; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
867; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
868; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
869; AVX1-NEXT:    retq
870;
871; AVX2-LABEL: constant_rotate_v4i32:
872; AVX2:       # BB#0:
873; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm1
874; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
875; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
876; AVX2-NEXT:    retq
877;
878; XOPAVX1-LABEL: constant_rotate_v4i32:
879; XOPAVX1:       # BB#0:
880; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm1
881; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
882; XOPAVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
883; XOPAVX1-NEXT:    retq
884;
885; XOPAVX2-LABEL: constant_rotate_v4i32:
886; XOPAVX2:       # BB#0:
887; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm1
888; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
889; XOPAVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
890; XOPAVX2-NEXT:    retq
891;
892; X32-SSE-LABEL: constant_rotate_v4i32:
893; X32-SSE:       # BB#0:
894; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
895; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
896; X32-SSE-NEXT:    pmuludq %xmm1, %xmm2
897; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
898; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
899; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
900; X32-SSE-NEXT:    pmuludq %xmm1, %xmm3
901; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
902; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
903; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
904; X32-SSE-NEXT:    psrld $25, %xmm1
905; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
906; X32-SSE-NEXT:    psrld $27, %xmm3
907; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
908; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
909; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
910; X32-SSE-NEXT:    psrld $26, %xmm3
911; X32-SSE-NEXT:    psrld $28, %xmm0
912; X32-SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
913; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
914; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
915; X32-SSE-NEXT:    por %xmm2, %xmm0
916; X32-SSE-NEXT:    retl
917  %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
918  %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25>
919  %or = or <4 x i32> %shl, %lshr
920  ret <4 x i32> %or
921}
922
923define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
924; SSE2-LABEL: constant_rotate_v8i16:
925; SSE2:       # BB#0:
926; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
927; SSE2-NEXT:    pmullw %xmm0, %xmm2
928; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
929; SSE2-NEXT:    movdqa %xmm1, %xmm3
930; SSE2-NEXT:    pandn %xmm0, %xmm3
931; SSE2-NEXT:    psrlw $8, %xmm0
932; SSE2-NEXT:    pand %xmm1, %xmm0
933; SSE2-NEXT:    por %xmm3, %xmm0
934; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,0,0]
935; SSE2-NEXT:    movdqa %xmm1, %xmm3
936; SSE2-NEXT:    pandn %xmm0, %xmm3
937; SSE2-NEXT:    psrlw $4, %xmm0
938; SSE2-NEXT:    pand %xmm1, %xmm0
939; SSE2-NEXT:    por %xmm3, %xmm0
940; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0]
941; SSE2-NEXT:    movdqa %xmm1, %xmm3
942; SSE2-NEXT:    pandn %xmm0, %xmm3
943; SSE2-NEXT:    psrlw $2, %xmm0
944; SSE2-NEXT:    pand %xmm1, %xmm0
945; SSE2-NEXT:    por %xmm3, %xmm0
946; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
947; SSE2-NEXT:    movdqa %xmm0, %xmm1
948; SSE2-NEXT:    pand %xmm3, %xmm1
949; SSE2-NEXT:    psrlw $1, %xmm0
950; SSE2-NEXT:    pandn %xmm0, %xmm3
951; SSE2-NEXT:    por %xmm2, %xmm3
952; SSE2-NEXT:    por %xmm3, %xmm1
953; SSE2-NEXT:    movdqa %xmm1, %xmm0
954; SSE2-NEXT:    retq
955;
956; SSE41-LABEL: constant_rotate_v8i16:
957; SSE41:       # BB#0:
958; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
959; SSE41-NEXT:    pmullw %xmm0, %xmm1
960; SSE41-NEXT:    movdqa %xmm0, %xmm2
961; SSE41-NEXT:    psrlw $8, %xmm2
962; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
963; SSE41-NEXT:    movdqa %xmm2, %xmm0
964; SSE41-NEXT:    psrlw $4, %xmm0
965; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4],xmm2[5,6,7]
966; SSE41-NEXT:    movdqa %xmm0, %xmm2
967; SSE41-NEXT:    psrlw $2, %xmm2
968; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
969; SSE41-NEXT:    movdqa %xmm2, %xmm0
970; SSE41-NEXT:    psrlw $1, %xmm0
971; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
972; SSE41-NEXT:    por %xmm1, %xmm0
973; SSE41-NEXT:    retq
974;
975; AVX1-LABEL: constant_rotate_v8i16:
976; AVX1:       # BB#0:
977; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
978; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
979; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
980; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
981; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4],xmm0[5,6,7]
982; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
983; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
984; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
985; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
986; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
987; AVX1-NEXT:    retq
988;
989; AVX2-LABEL: constant_rotate_v8i16:
990; AVX2:       # BB#0:
991; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
992; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
993; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
994; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
995; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
996; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
997; AVX2-NEXT:    vzeroupper
998; AVX2-NEXT:    retq
999;
1000; XOP-LABEL: constant_rotate_v8i16:
1001; XOP:       # BB#0:
1002; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
1003; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1004; XOP-NEXT:    vpsubw {{.*}}(%rip), %xmm2, %xmm2
1005; XOP-NEXT:    vpshlw %xmm2, %xmm0, %xmm0
1006; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1007; XOP-NEXT:    retq
1008;
1009; X32-SSE-LABEL: constant_rotate_v8i16:
1010; X32-SSE:       # BB#0:
1011; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1012; X32-SSE-NEXT:    pmullw %xmm0, %xmm2
1013; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
1014; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
1015; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1016; X32-SSE-NEXT:    psrlw $8, %xmm0
1017; X32-SSE-NEXT:    pand %xmm1, %xmm0
1018; X32-SSE-NEXT:    por %xmm3, %xmm0
1019; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,0,0]
1020; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
1021; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1022; X32-SSE-NEXT:    psrlw $4, %xmm0
1023; X32-SSE-NEXT:    pand %xmm1, %xmm0
1024; X32-SSE-NEXT:    por %xmm3, %xmm0
1025; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0]
1026; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
1027; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1028; X32-SSE-NEXT:    psrlw $2, %xmm0
1029; X32-SSE-NEXT:    pand %xmm1, %xmm0
1030; X32-SSE-NEXT:    por %xmm3, %xmm0
1031; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
1032; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1033; X32-SSE-NEXT:    pand %xmm3, %xmm1
1034; X32-SSE-NEXT:    psrlw $1, %xmm0
1035; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1036; X32-SSE-NEXT:    por %xmm2, %xmm3
1037; X32-SSE-NEXT:    por %xmm3, %xmm1
1038; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1039; X32-SSE-NEXT:    retl
1040  %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
1041  %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9>
1042  %or = or <8 x i16> %shl, %lshr
1043  ret <8 x i16> %or
1044}
1045
1046define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
1047; SSE2-LABEL: constant_rotate_v16i8:
1048; SSE2:       # BB#0:
1049; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1050; SSE2-NEXT:    psllw $5, %xmm3
1051; SSE2-NEXT:    pxor %xmm2, %xmm2
1052; SSE2-NEXT:    pxor %xmm1, %xmm1
1053; SSE2-NEXT:    pcmpgtb %xmm3, %xmm1
1054; SSE2-NEXT:    movdqa %xmm0, %xmm4
1055; SSE2-NEXT:    psllw $4, %xmm4
1056; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
1057; SSE2-NEXT:    pand %xmm1, %xmm4
1058; SSE2-NEXT:    pandn %xmm0, %xmm1
1059; SSE2-NEXT:    por %xmm4, %xmm1
1060; SSE2-NEXT:    paddb %xmm3, %xmm3
1061; SSE2-NEXT:    pxor %xmm4, %xmm4
1062; SSE2-NEXT:    pcmpgtb %xmm3, %xmm4
1063; SSE2-NEXT:    movdqa %xmm4, %xmm5
1064; SSE2-NEXT:    pandn %xmm1, %xmm5
1065; SSE2-NEXT:    psllw $2, %xmm1
1066; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1067; SSE2-NEXT:    pand %xmm4, %xmm1
1068; SSE2-NEXT:    por %xmm5, %xmm1
1069; SSE2-NEXT:    paddb %xmm3, %xmm3
1070; SSE2-NEXT:    pxor %xmm4, %xmm4
1071; SSE2-NEXT:    pcmpgtb %xmm3, %xmm4
1072; SSE2-NEXT:    movdqa %xmm4, %xmm3
1073; SSE2-NEXT:    pandn %xmm1, %xmm3
1074; SSE2-NEXT:    paddb %xmm1, %xmm1
1075; SSE2-NEXT:    pand %xmm4, %xmm1
1076; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1077; SSE2-NEXT:    psllw $5, %xmm4
1078; SSE2-NEXT:    pxor %xmm5, %xmm5
1079; SSE2-NEXT:    pcmpgtb %xmm4, %xmm5
1080; SSE2-NEXT:    movdqa %xmm5, %xmm6
1081; SSE2-NEXT:    pandn %xmm0, %xmm6
1082; SSE2-NEXT:    psrlw $4, %xmm0
1083; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1084; SSE2-NEXT:    pand %xmm5, %xmm0
1085; SSE2-NEXT:    por %xmm6, %xmm0
1086; SSE2-NEXT:    paddb %xmm4, %xmm4
1087; SSE2-NEXT:    pxor %xmm5, %xmm5
1088; SSE2-NEXT:    pcmpgtb %xmm4, %xmm5
1089; SSE2-NEXT:    movdqa %xmm5, %xmm6
1090; SSE2-NEXT:    pandn %xmm0, %xmm6
1091; SSE2-NEXT:    psrlw $2, %xmm0
1092; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1093; SSE2-NEXT:    pand %xmm5, %xmm0
1094; SSE2-NEXT:    por %xmm6, %xmm0
1095; SSE2-NEXT:    paddb %xmm4, %xmm4
1096; SSE2-NEXT:    pcmpgtb %xmm4, %xmm2
1097; SSE2-NEXT:    movdqa %xmm2, %xmm4
1098; SSE2-NEXT:    pandn %xmm0, %xmm4
1099; SSE2-NEXT:    psrlw $1, %xmm0
1100; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1101; SSE2-NEXT:    pand %xmm2, %xmm0
1102; SSE2-NEXT:    por %xmm4, %xmm0
1103; SSE2-NEXT:    por %xmm3, %xmm0
1104; SSE2-NEXT:    por %xmm1, %xmm0
1105; SSE2-NEXT:    retq
1106;
1107; SSE41-LABEL: constant_rotate_v16i8:
1108; SSE41:       # BB#0:
1109; SSE41-NEXT:    movdqa %xmm0, %xmm1
1110; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1111; SSE41-NEXT:    psllw $5, %xmm0
1112; SSE41-NEXT:    movdqa %xmm1, %xmm3
1113; SSE41-NEXT:    psllw $4, %xmm3
1114; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1115; SSE41-NEXT:    movdqa %xmm1, %xmm2
1116; SSE41-NEXT:    pblendvb %xmm3, %xmm2
1117; SSE41-NEXT:    movdqa %xmm2, %xmm3
1118; SSE41-NEXT:    psllw $2, %xmm3
1119; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1120; SSE41-NEXT:    paddb %xmm0, %xmm0
1121; SSE41-NEXT:    pblendvb %xmm3, %xmm2
1122; SSE41-NEXT:    movdqa %xmm2, %xmm3
1123; SSE41-NEXT:    paddb %xmm3, %xmm3
1124; SSE41-NEXT:    paddb %xmm0, %xmm0
1125; SSE41-NEXT:    pblendvb %xmm3, %xmm2
1126; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1127; SSE41-NEXT:    psllw $5, %xmm0
1128; SSE41-NEXT:    movdqa %xmm1, %xmm3
1129; SSE41-NEXT:    psrlw $4, %xmm3
1130; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1131; SSE41-NEXT:    pblendvb %xmm3, %xmm1
1132; SSE41-NEXT:    movdqa %xmm1, %xmm3
1133; SSE41-NEXT:    psrlw $2, %xmm3
1134; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1135; SSE41-NEXT:    paddb %xmm0, %xmm0
1136; SSE41-NEXT:    pblendvb %xmm3, %xmm1
1137; SSE41-NEXT:    movdqa %xmm1, %xmm3
1138; SSE41-NEXT:    psrlw $1, %xmm3
1139; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1140; SSE41-NEXT:    paddb %xmm0, %xmm0
1141; SSE41-NEXT:    pblendvb %xmm3, %xmm1
1142; SSE41-NEXT:    por %xmm2, %xmm1
1143; SSE41-NEXT:    movdqa %xmm1, %xmm0
1144; SSE41-NEXT:    retq
1145;
1146; AVX-LABEL: constant_rotate_v16i8:
1147; AVX:       # BB#0:
1148; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1149; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
1150; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
1151; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1152; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm2
1153; AVX-NEXT:    vpsllw $2, %xmm2, %xmm3
1154; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1155; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1156; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
1157; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
1158; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1159; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm1
1160; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1161; AVX-NEXT:    vpsllw $5, %xmm2, %xmm2
1162; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
1163; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1164; AVX-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
1165; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm3
1166; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1167; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1168; AVX-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
1169; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm3
1170; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1171; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1172; AVX-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
1173; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1174; AVX-NEXT:    retq
1175;
1176; XOP-LABEL: constant_rotate_v16i8:
1177; XOP:       # BB#0:
1178; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm1
1179; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1180; XOP-NEXT:    vpsubb {{.*}}(%rip), %xmm2, %xmm2
1181; XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
1182; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1183; XOP-NEXT:    retq
1184;
1185; X32-SSE-LABEL: constant_rotate_v16i8:
1186; X32-SSE:       # BB#0:
1187; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1188; X32-SSE-NEXT:    psllw $5, %xmm3
1189; X32-SSE-NEXT:    pxor %xmm2, %xmm2
1190; X32-SSE-NEXT:    pxor %xmm1, %xmm1
1191; X32-SSE-NEXT:    pcmpgtb %xmm3, %xmm1
1192; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
1193; X32-SSE-NEXT:    psllw $4, %xmm4
1194; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm4
1195; X32-SSE-NEXT:    pand %xmm1, %xmm4
1196; X32-SSE-NEXT:    pandn %xmm0, %xmm1
1197; X32-SSE-NEXT:    por %xmm4, %xmm1
1198; X32-SSE-NEXT:    paddb %xmm3, %xmm3
1199; X32-SSE-NEXT:    pxor %xmm4, %xmm4
1200; X32-SSE-NEXT:    pcmpgtb %xmm3, %xmm4
1201; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
1202; X32-SSE-NEXT:    pandn %xmm1, %xmm5
1203; X32-SSE-NEXT:    psllw $2, %xmm1
1204; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
1205; X32-SSE-NEXT:    pand %xmm4, %xmm1
1206; X32-SSE-NEXT:    por %xmm5, %xmm1
1207; X32-SSE-NEXT:    paddb %xmm3, %xmm3
1208; X32-SSE-NEXT:    pxor %xmm4, %xmm4
1209; X32-SSE-NEXT:    pcmpgtb %xmm3, %xmm4
1210; X32-SSE-NEXT:    movdqa %xmm4, %xmm3
1211; X32-SSE-NEXT:    pandn %xmm1, %xmm3
1212; X32-SSE-NEXT:    paddb %xmm1, %xmm1
1213; X32-SSE-NEXT:    pand %xmm4, %xmm1
1214; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1215; X32-SSE-NEXT:    psllw $5, %xmm4
1216; X32-SSE-NEXT:    pxor %xmm5, %xmm5
1217; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
1218; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
1219; X32-SSE-NEXT:    pandn %xmm0, %xmm6
1220; X32-SSE-NEXT:    psrlw $4, %xmm0
1221; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1222; X32-SSE-NEXT:    pand %xmm5, %xmm0
1223; X32-SSE-NEXT:    por %xmm6, %xmm0
1224; X32-SSE-NEXT:    paddb %xmm4, %xmm4
1225; X32-SSE-NEXT:    pxor %xmm5, %xmm5
1226; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
1227; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
1228; X32-SSE-NEXT:    pandn %xmm0, %xmm6
1229; X32-SSE-NEXT:    psrlw $2, %xmm0
1230; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1231; X32-SSE-NEXT:    pand %xmm5, %xmm0
1232; X32-SSE-NEXT:    por %xmm6, %xmm0
1233; X32-SSE-NEXT:    paddb %xmm4, %xmm4
1234; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm2
1235; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
1236; X32-SSE-NEXT:    pandn %xmm0, %xmm4
1237; X32-SSE-NEXT:    psrlw $1, %xmm0
1238; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
1239; X32-SSE-NEXT:    pand %xmm2, %xmm0
1240; X32-SSE-NEXT:    por %xmm4, %xmm0
1241; X32-SSE-NEXT:    por %xmm3, %xmm0
1242; X32-SSE-NEXT:    por %xmm1, %xmm0
1243; X32-SSE-NEXT:    retl
1244  %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1245  %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1246  %or = or <16 x i8> %shl, %lshr
1247  ret <16 x i8> %or
1248}
1249
1250;
1251; Uniform Constant Rotates
1252;
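; With a uniform (splat) constant amount the rotate reduces to one immediate
; left shift, one immediate right shift and a por, while XOP folds the whole
; thing into a single vprot* with an immediate operand. The v16i8 case also
; needs pand masks since x86 has no native 8-bit shift instructions.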
1253
1254define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
1255; SSE-LABEL: splatconstant_rotate_v2i64:
1256; SSE:       # BB#0:
1257; SSE-NEXT:    movdqa %xmm0, %xmm1
1258; SSE-NEXT:    psllq $14, %xmm1
1259; SSE-NEXT:    psrlq $50, %xmm0
1260; SSE-NEXT:    por %xmm1, %xmm0
1261; SSE-NEXT:    retq
1262;
1263; AVX-LABEL: splatconstant_rotate_v2i64:
1264; AVX:       # BB#0:
1265; AVX-NEXT:    vpsllq $14, %xmm0, %xmm1
1266; AVX-NEXT:    vpsrlq $50, %xmm0, %xmm0
1267; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1268; AVX-NEXT:    retq
1269;
1270; XOP-LABEL: splatconstant_rotate_v2i64:
1271; XOP:       # BB#0:
1272; XOP-NEXT:    vprotq $14, %xmm0, %xmm0
1273; XOP-NEXT:    retq
1274;
1275; X32-SSE-LABEL: splatconstant_rotate_v2i64:
1276; X32-SSE:       # BB#0:
1277; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1278; X32-SSE-NEXT:    psllq $14, %xmm1
1279; X32-SSE-NEXT:    psrlq $50, %xmm0
1280; X32-SSE-NEXT:    por %xmm1, %xmm0
1281; X32-SSE-NEXT:    retl
1282  %shl = shl <2 x i64> %a, <i64 14, i64 14>
1283  %lshr = lshr <2 x i64> %a, <i64 50, i64 50>
1284  %or = or <2 x i64> %shl, %lshr
1285  ret <2 x i64> %or
1286}
1287
1288define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
1289; SSE-LABEL: splatconstant_rotate_v4i32:
1290; SSE:       # BB#0:
1291; SSE-NEXT:    movdqa %xmm0, %xmm1
1292; SSE-NEXT:    pslld $4, %xmm1
1293; SSE-NEXT:    psrld $28, %xmm0
1294; SSE-NEXT:    por %xmm1, %xmm0
1295; SSE-NEXT:    retq
1296;
1297; AVX-LABEL: splatconstant_rotate_v4i32:
1298; AVX:       # BB#0:
1299; AVX-NEXT:    vpslld $4, %xmm0, %xmm1
1300; AVX-NEXT:    vpsrld $28, %xmm0, %xmm0
1301; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1302; AVX-NEXT:    retq
1303;
1304; XOP-LABEL: splatconstant_rotate_v4i32:
1305; XOP:       # BB#0:
1306; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
1307; XOP-NEXT:    retq
1308;
1309; X32-SSE-LABEL: splatconstant_rotate_v4i32:
1310; X32-SSE:       # BB#0:
1311; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1312; X32-SSE-NEXT:    pslld $4, %xmm1
1313; X32-SSE-NEXT:    psrld $28, %xmm0
1314; X32-SSE-NEXT:    por %xmm1, %xmm0
1315; X32-SSE-NEXT:    retl
1316  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
1317  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
1318  %or = or <4 x i32> %shl, %lshr
1319  ret <4 x i32> %or
1320}
1321
1322define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
1323; SSE-LABEL: splatconstant_rotate_v8i16:
1324; SSE:       # BB#0:
1325; SSE-NEXT:    movdqa %xmm0, %xmm1
1326; SSE-NEXT:    psllw $7, %xmm1
1327; SSE-NEXT:    psrlw $9, %xmm0
1328; SSE-NEXT:    por %xmm1, %xmm0
1329; SSE-NEXT:    retq
1330;
1331; AVX-LABEL: splatconstant_rotate_v8i16:
1332; AVX:       # BB#0:
1333; AVX-NEXT:    vpsllw $7, %xmm0, %xmm1
1334; AVX-NEXT:    vpsrlw $9, %xmm0, %xmm0
1335; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1336; AVX-NEXT:    retq
1337;
1338; XOP-LABEL: splatconstant_rotate_v8i16:
1339; XOP:       # BB#0:
1340; XOP-NEXT:    vprotw $7, %xmm0, %xmm0
1341; XOP-NEXT:    retq
1342;
1343; X32-SSE-LABEL: splatconstant_rotate_v8i16:
1344; X32-SSE:       # BB#0:
1345; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1346; X32-SSE-NEXT:    psllw $7, %xmm1
1347; X32-SSE-NEXT:    psrlw $9, %xmm0
1348; X32-SSE-NEXT:    por %xmm1, %xmm0
1349; X32-SSE-NEXT:    retl
1350  %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
1351  %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
1352  %or = or <8 x i16> %shl, %lshr
1353  ret <8 x i16> %or
1354}
1355
define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $4, %xmm1
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrlw $4, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllw $4, %xmm1
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Masked Uniform Constant Rotates
;
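; NOTE (illustrative comment, not part of the autogenerated checks): the tests
; below build a splat-constant rotate whose two halves are masked with
; non-all-ones constants before being recombined. Taking the v2i64 case as a
; minimal sketch of the pattern:
;   %shl   = shl  <2 x i64> %a, <i64 15, i64 15>
;   %lshr  = lshr <2 x i64> %a, <i64 49, i64 49>
;   %lmask = and  <2 x i64> %shl,  <i64 65, i64 33>
;   %rmask = and  <2 x i64> %lshr, <i64 255, i64 127>
;   %or    = or   <2 x i64> %lmask, %rmask
; As the checks show, SSE/AVX keep the explicit shift+and+or sequence, while
; XOP can still emit a single vprot followed by one mask.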

define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $15, %xmm1
; SSE-NEXT:    psrlq $49, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllq $15, %xmm0, %xmm1
; AVX-NEXT:    vpsrlq $49, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vprotq $15, %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllq $15, %xmm1
; X32-SSE-NEXT:    psrlq $49, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    por %xmm0, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <2 x i64> %a, <i64 15, i64 15>
  %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
  %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
  %lmask = and <2 x i64> %shl, <i64 65, i64 33>
  %or = or <2 x i64> %lmask, %rmask
  ret <2 x i64> %or
}

define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $4, %xmm1
; SSE-NEXT:    psrld $28, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpslld $4, %xmm0, %xmm1
; AVX-NEXT:    vpsrld $28, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    pslld $4, %xmm1
; X32-SSE-NEXT:    psrld $28, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    por %xmm0, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
  %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
  %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
  %or = or <4 x i32> %lmask, %rmask
  ret <4 x i32> %or
}

define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $5, %xmm1
; SSE-NEXT:    psrlw $11, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $5, %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $11, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vprotw $5, %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllw $5, %xmm1
; X32-SSE-NEXT:    psrlw $11, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    por %xmm0, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <8 x i16> %lmask, %rmask
  ret <8 x i16> %or
}

define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $4, %xmm1
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrlw $4, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllw $4, %xmm1
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    por %xmm0, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <16 x i8> %lmask, %rmask
  ret <16 x i8> %or
}
