1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
8;
9; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
10; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
11
12;
13; Variable Rotates
14;
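; Each function below builds a rotate as (a shl b) | (a lshr (bits - b)).
; Targets without a native vector rotate have to expand this with per-element
; shifts; XOP targets can match the whole pattern to vprotq/vprotd/vprotw/vprotb.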
15
16define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
17; SSE2-LABEL: var_rotate_v2i64:
18; SSE2:       # BB#0:
19; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
20; SSE2-NEXT:    psubq %xmm1, %xmm2
21; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
22; SSE2-NEXT:    movdqa %xmm0, %xmm4
23; SSE2-NEXT:    psllq %xmm3, %xmm4
24; SSE2-NEXT:    movdqa %xmm0, %xmm3
25; SSE2-NEXT:    psllq %xmm1, %xmm3
26; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
27; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
28; SSE2-NEXT:    movdqa %xmm0, %xmm1
29; SSE2-NEXT:    psrlq %xmm3, %xmm1
30; SSE2-NEXT:    psrlq %xmm2, %xmm0
31; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
32; SSE2-NEXT:    orpd %xmm4, %xmm1
33; SSE2-NEXT:    movapd %xmm1, %xmm0
34; SSE2-NEXT:    retq
35;
36; SSE41-LABEL: var_rotate_v2i64:
37; SSE41:       # BB#0:
38; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
39; SSE41-NEXT:    psubq %xmm1, %xmm2
40; SSE41-NEXT:    movdqa %xmm0, %xmm3
41; SSE41-NEXT:    psllq %xmm1, %xmm3
42; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
43; SSE41-NEXT:    movdqa %xmm0, %xmm4
44; SSE41-NEXT:    psllq %xmm1, %xmm4
45; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
46; SSE41-NEXT:    movdqa %xmm0, %xmm1
47; SSE41-NEXT:    psrlq %xmm2, %xmm1
48; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
49; SSE41-NEXT:    psrlq %xmm2, %xmm0
50; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
51; SSE41-NEXT:    por %xmm4, %xmm0
52; SSE41-NEXT:    retq
53;
54; AVX1-LABEL: var_rotate_v2i64:
55; AVX1:       # BB#0:
56; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
57; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
58; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm3
59; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
60; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
61; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
62; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
63; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
64; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
65; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
66; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
67; AVX1-NEXT:    retq
68;
69; AVX2-LABEL: var_rotate_v2i64:
70; AVX2:       # BB#0:
71; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
72; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
73; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm1
74; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
75; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
76; AVX2-NEXT:    retq
77;
78; XOP-LABEL: var_rotate_v2i64:
79; XOP:       # BB#0:
80; XOP-NEXT:    vprotq %xmm1, %xmm0, %xmm0
81; XOP-NEXT:    retq
82;
83; X32-SSE-LABEL: var_rotate_v2i64:
84; X32-SSE:       # BB#0:
85; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64,0,64,0]
86; X32-SSE-NEXT:    psubq %xmm1, %xmm2
87; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
88; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
89; X32-SSE-NEXT:    psllq %xmm3, %xmm4
90; X32-SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
91; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
92; X32-SSE-NEXT:    psllq %xmm1, %xmm3
93; X32-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
94; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
95; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
96; X32-SSE-NEXT:    psrlq %xmm3, %xmm1
97; X32-SSE-NEXT:    movq {{.*#+}} xmm2 = xmm2[0],zero
98; X32-SSE-NEXT:    psrlq %xmm2, %xmm0
99; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
100; X32-SSE-NEXT:    orpd %xmm4, %xmm1
101; X32-SSE-NEXT:    movapd %xmm1, %xmm0
102; X32-SSE-NEXT:    retl
103  %b64 = sub <2 x i64> <i64 64, i64 64>, %b
104  %shl = shl <2 x i64> %a, %b
105  %lshr = lshr <2 x i64> %a, %b64
106  %or = or <2 x i64> %shl, %lshr
107  ret <2 x i64> %or
108}
109
110define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
111; SSE2-LABEL: var_rotate_v4i32:
112; SSE2:       # BB#0:
113; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
114; SSE2-NEXT:    psubd %xmm1, %xmm2
115; SSE2-NEXT:    pslld $23, %xmm1
116; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
117; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
118; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
119; SSE2-NEXT:    pmuludq %xmm0, %xmm1
120; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
121; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
122; SSE2-NEXT:    pmuludq %xmm3, %xmm4
123; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
124; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
125; SSE2-NEXT:    movdqa %xmm2, %xmm3
126; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
127; SSE2-NEXT:    movdqa %xmm0, %xmm4
128; SSE2-NEXT:    psrld %xmm3, %xmm4
129; SSE2-NEXT:    movdqa %xmm2, %xmm3
130; SSE2-NEXT:    psrlq $32, %xmm3
131; SSE2-NEXT:    movdqa %xmm0, %xmm5
132; SSE2-NEXT:    psrld %xmm3, %xmm5
133; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
134; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
135; SSE2-NEXT:    pxor %xmm4, %xmm4
136; SSE2-NEXT:    movdqa %xmm2, %xmm5
137; SSE2-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
138; SSE2-NEXT:    movdqa %xmm0, %xmm6
139; SSE2-NEXT:    psrld %xmm5, %xmm6
140; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
141; SSE2-NEXT:    psrld %xmm2, %xmm0
142; SSE2-NEXT:    movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
143; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
144; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
145; SSE2-NEXT:    por %xmm1, %xmm0
146; SSE2-NEXT:    retq
147;
148; SSE41-LABEL: var_rotate_v4i32:
149; SSE41:       # BB#0:
150; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
151; SSE41-NEXT:    psubd %xmm1, %xmm2
152; SSE41-NEXT:    pslld $23, %xmm1
153; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
154; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
155; SSE41-NEXT:    pmulld %xmm0, %xmm1
156; SSE41-NEXT:    movdqa %xmm2, %xmm3
157; SSE41-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
158; SSE41-NEXT:    movdqa %xmm0, %xmm4
159; SSE41-NEXT:    psrld %xmm3, %xmm4
160; SSE41-NEXT:    movdqa %xmm2, %xmm3
161; SSE41-NEXT:    psrlq $32, %xmm3
162; SSE41-NEXT:    movdqa %xmm0, %xmm5
163; SSE41-NEXT:    psrld %xmm3, %xmm5
164; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
165; SSE41-NEXT:    pxor %xmm3, %xmm3
166; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
167; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
168; SSE41-NEXT:    movdqa %xmm0, %xmm3
169; SSE41-NEXT:    psrld %xmm2, %xmm3
170; SSE41-NEXT:    psrld %xmm4, %xmm0
171; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
172; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
173; SSE41-NEXT:    por %xmm1, %xmm0
174; SSE41-NEXT:    retq
175;
176; AVX1-LABEL: var_rotate_v4i32:
177; AVX1:       # BB#0:
178; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
179; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
180; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
181; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm1, %xmm1
182; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
183; AVX1-NEXT:    vpmulld %xmm0, %xmm1, %xmm1
184; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
185; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
186; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm4
187; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
188; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
189; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
190; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
191; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
192; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
193; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
194; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
195; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
196; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
197; AVX1-NEXT:    retq
198;
199; AVX2-LABEL: var_rotate_v4i32:
200; AVX2:       # BB#0:
201; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
202; AVX2-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
203; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm1
204; AVX2-NEXT:    vpsrlvd %xmm2, %xmm0, %xmm0
205; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
206; AVX2-NEXT:    retq
207;
208; XOP-LABEL: var_rotate_v4i32:
209; XOP:       # BB#0:
210; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
211; XOP-NEXT:    retq
212;
213; X32-SSE-LABEL: var_rotate_v4i32:
214; X32-SSE:       # BB#0:
215; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
216; X32-SSE-NEXT:    psubd %xmm1, %xmm2
217; X32-SSE-NEXT:    pslld $23, %xmm1
218; X32-SSE-NEXT:    paddd .LCPI1_1, %xmm1
219; X32-SSE-NEXT:    cvttps2dq %xmm1, %xmm1
220; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
221; X32-SSE-NEXT:    pmuludq %xmm0, %xmm1
222; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
223; X32-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
224; X32-SSE-NEXT:    pmuludq %xmm3, %xmm4
225; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
226; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
227; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
228; X32-SSE-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
229; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
230; X32-SSE-NEXT:    psrld %xmm3, %xmm4
231; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
232; X32-SSE-NEXT:    psrlq $32, %xmm3
233; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
234; X32-SSE-NEXT:    psrld %xmm3, %xmm5
235; X32-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
236; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
237; X32-SSE-NEXT:    pxor %xmm4, %xmm4
238; X32-SSE-NEXT:    movdqa %xmm2, %xmm5
239; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
240; X32-SSE-NEXT:    movdqa %xmm0, %xmm6
241; X32-SSE-NEXT:    psrld %xmm5, %xmm6
242; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
243; X32-SSE-NEXT:    psrld %xmm2, %xmm0
244; X32-SSE-NEXT:    movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
245; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
246; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
247; X32-SSE-NEXT:    por %xmm1, %xmm0
248; X32-SSE-NEXT:    retl
249  %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b
250  %shl = shl <4 x i32> %a, %b
251  %lshr = lshr <4 x i32> %a, %b32
252  %or = or <4 x i32> %shl, %lshr
253  ret <4 x i32> %or
254}
255
256define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
257; SSE2-LABEL: var_rotate_v8i16:
258; SSE2:       # BB#0:
259; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
260; SSE2-NEXT:    psubw %xmm1, %xmm3
261; SSE2-NEXT:    psllw $12, %xmm1
262; SSE2-NEXT:    movdqa %xmm1, %xmm2
263; SSE2-NEXT:    psraw $15, %xmm2
264; SSE2-NEXT:    movdqa %xmm0, %xmm4
265; SSE2-NEXT:    psllw $8, %xmm4
266; SSE2-NEXT:    pand %xmm2, %xmm4
267; SSE2-NEXT:    pandn %xmm0, %xmm2
268; SSE2-NEXT:    por %xmm4, %xmm2
269; SSE2-NEXT:    paddw %xmm1, %xmm1
270; SSE2-NEXT:    movdqa %xmm1, %xmm4
271; SSE2-NEXT:    psraw $15, %xmm4
272; SSE2-NEXT:    movdqa %xmm4, %xmm5
273; SSE2-NEXT:    pandn %xmm2, %xmm5
274; SSE2-NEXT:    psllw $4, %xmm2
275; SSE2-NEXT:    pand %xmm4, %xmm2
276; SSE2-NEXT:    por %xmm5, %xmm2
277; SSE2-NEXT:    paddw %xmm1, %xmm1
278; SSE2-NEXT:    movdqa %xmm1, %xmm4
279; SSE2-NEXT:    psraw $15, %xmm4
280; SSE2-NEXT:    movdqa %xmm4, %xmm5
281; SSE2-NEXT:    pandn %xmm2, %xmm5
282; SSE2-NEXT:    psllw $2, %xmm2
283; SSE2-NEXT:    pand %xmm4, %xmm2
284; SSE2-NEXT:    por %xmm5, %xmm2
285; SSE2-NEXT:    paddw %xmm1, %xmm1
286; SSE2-NEXT:    psraw $15, %xmm1
287; SSE2-NEXT:    movdqa %xmm1, %xmm4
288; SSE2-NEXT:    pandn %xmm2, %xmm4
289; SSE2-NEXT:    psllw $1, %xmm2
290; SSE2-NEXT:    pand %xmm1, %xmm2
291; SSE2-NEXT:    psllw $12, %xmm3
292; SSE2-NEXT:    movdqa %xmm3, %xmm1
293; SSE2-NEXT:    psraw $15, %xmm1
294; SSE2-NEXT:    movdqa %xmm1, %xmm5
295; SSE2-NEXT:    pandn %xmm0, %xmm5
296; SSE2-NEXT:    psrlw $8, %xmm0
297; SSE2-NEXT:    pand %xmm1, %xmm0
298; SSE2-NEXT:    por %xmm5, %xmm0
299; SSE2-NEXT:    paddw %xmm3, %xmm3
300; SSE2-NEXT:    movdqa %xmm3, %xmm1
301; SSE2-NEXT:    psraw $15, %xmm1
302; SSE2-NEXT:    movdqa %xmm1, %xmm5
303; SSE2-NEXT:    pandn %xmm0, %xmm5
304; SSE2-NEXT:    psrlw $4, %xmm0
305; SSE2-NEXT:    pand %xmm1, %xmm0
306; SSE2-NEXT:    por %xmm5, %xmm0
307; SSE2-NEXT:    paddw %xmm3, %xmm3
308; SSE2-NEXT:    movdqa %xmm3, %xmm1
309; SSE2-NEXT:    psraw $15, %xmm1
310; SSE2-NEXT:    movdqa %xmm1, %xmm5
311; SSE2-NEXT:    pandn %xmm0, %xmm5
312; SSE2-NEXT:    psrlw $2, %xmm0
313; SSE2-NEXT:    pand %xmm1, %xmm0
314; SSE2-NEXT:    por %xmm5, %xmm0
315; SSE2-NEXT:    paddw %xmm3, %xmm3
316; SSE2-NEXT:    psraw $15, %xmm3
317; SSE2-NEXT:    movdqa %xmm3, %xmm1
318; SSE2-NEXT:    pandn %xmm0, %xmm1
319; SSE2-NEXT:    psrlw $1, %xmm0
320; SSE2-NEXT:    pand %xmm3, %xmm0
321; SSE2-NEXT:    por %xmm1, %xmm0
322; SSE2-NEXT:    por %xmm4, %xmm0
323; SSE2-NEXT:    por %xmm2, %xmm0
324; SSE2-NEXT:    retq
325;
326; SSE41-LABEL: var_rotate_v8i16:
327; SSE41:       # BB#0:
328; SSE41-NEXT:    movdqa %xmm0, %xmm3
329; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
330; SSE41-NEXT:    psubw %xmm1, %xmm2
331; SSE41-NEXT:    movdqa %xmm1, %xmm0
332; SSE41-NEXT:    psllw $12, %xmm0
333; SSE41-NEXT:    psllw $4, %xmm1
334; SSE41-NEXT:    por %xmm0, %xmm1
335; SSE41-NEXT:    movdqa %xmm1, %xmm4
336; SSE41-NEXT:    paddw %xmm4, %xmm4
337; SSE41-NEXT:    movdqa %xmm3, %xmm6
338; SSE41-NEXT:    psllw $8, %xmm6
339; SSE41-NEXT:    movdqa %xmm3, %xmm5
340; SSE41-NEXT:    movdqa %xmm1, %xmm0
341; SSE41-NEXT:    pblendvb %xmm6, %xmm5
342; SSE41-NEXT:    movdqa %xmm5, %xmm1
343; SSE41-NEXT:    psllw $4, %xmm1
344; SSE41-NEXT:    movdqa %xmm4, %xmm0
345; SSE41-NEXT:    pblendvb %xmm1, %xmm5
346; SSE41-NEXT:    movdqa %xmm5, %xmm1
347; SSE41-NEXT:    psllw $2, %xmm1
348; SSE41-NEXT:    paddw %xmm4, %xmm4
349; SSE41-NEXT:    movdqa %xmm4, %xmm0
350; SSE41-NEXT:    pblendvb %xmm1, %xmm5
351; SSE41-NEXT:    movdqa %xmm5, %xmm1
352; SSE41-NEXT:    psllw $1, %xmm1
353; SSE41-NEXT:    paddw %xmm4, %xmm4
354; SSE41-NEXT:    movdqa %xmm4, %xmm0
355; SSE41-NEXT:    pblendvb %xmm1, %xmm5
356; SSE41-NEXT:    movdqa %xmm2, %xmm0
357; SSE41-NEXT:    psllw $12, %xmm0
358; SSE41-NEXT:    psllw $4, %xmm2
359; SSE41-NEXT:    por %xmm0, %xmm2
360; SSE41-NEXT:    movdqa %xmm2, %xmm1
361; SSE41-NEXT:    paddw %xmm1, %xmm1
362; SSE41-NEXT:    movdqa %xmm3, %xmm4
363; SSE41-NEXT:    psrlw $8, %xmm4
364; SSE41-NEXT:    movdqa %xmm2, %xmm0
365; SSE41-NEXT:    pblendvb %xmm4, %xmm3
366; SSE41-NEXT:    movdqa %xmm3, %xmm2
367; SSE41-NEXT:    psrlw $4, %xmm2
368; SSE41-NEXT:    movdqa %xmm1, %xmm0
369; SSE41-NEXT:    pblendvb %xmm2, %xmm3
370; SSE41-NEXT:    movdqa %xmm3, %xmm2
371; SSE41-NEXT:    psrlw $2, %xmm2
372; SSE41-NEXT:    paddw %xmm1, %xmm1
373; SSE41-NEXT:    movdqa %xmm1, %xmm0
374; SSE41-NEXT:    pblendvb %xmm2, %xmm3
375; SSE41-NEXT:    movdqa %xmm3, %xmm2
376; SSE41-NEXT:    psrlw $1, %xmm2
377; SSE41-NEXT:    paddw %xmm1, %xmm1
378; SSE41-NEXT:    movdqa %xmm1, %xmm0
379; SSE41-NEXT:    pblendvb %xmm2, %xmm3
380; SSE41-NEXT:    por %xmm5, %xmm3
381; SSE41-NEXT:    movdqa %xmm3, %xmm0
382; SSE41-NEXT:    retq
383;
384; AVX1-LABEL: var_rotate_v8i16:
385; AVX1:       # BB#0:
386; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
387; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
388; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
389; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
390; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
391; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
392; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm4
393; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm1
394; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm4
395; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
396; AVX1-NEXT:    vpsllw $2, %xmm1, %xmm4
397; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
398; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
399; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm4
400; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
401; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm1, %xmm1
402; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
403; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
404; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
405; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
406; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
407; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
408; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
409; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
410; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
411; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
412; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
413; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
414; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
415; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
416; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
417; AVX1-NEXT:    retq
418;
419; AVX2-LABEL: var_rotate_v8i16:
420; AVX2:       # BB#0:
421; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
422; AVX2-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
423; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
424; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
425; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm1
426; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
427; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
428; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
429; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
430; AVX2-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm0
431; AVX2-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
432; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
433; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
434; AVX2-NEXT:    vzeroupper
435; AVX2-NEXT:    retq
436;
437; XOP-LABEL: var_rotate_v8i16:
438; XOP:       # BB#0:
439; XOP-NEXT:    vprotw %xmm1, %xmm0, %xmm0
440; XOP-NEXT:    retq
441;
442; X32-SSE-LABEL: var_rotate_v8i16:
443; X32-SSE:       # BB#0:
444; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
445; X32-SSE-NEXT:    psubw %xmm1, %xmm3
446; X32-SSE-NEXT:    psllw $12, %xmm1
447; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
448; X32-SSE-NEXT:    psraw $15, %xmm2
449; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
450; X32-SSE-NEXT:    psllw $8, %xmm4
451; X32-SSE-NEXT:    pand %xmm2, %xmm4
452; X32-SSE-NEXT:    pandn %xmm0, %xmm2
453; X32-SSE-NEXT:    por %xmm4, %xmm2
454; X32-SSE-NEXT:    paddw %xmm1, %xmm1
455; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
456; X32-SSE-NEXT:    psraw $15, %xmm4
457; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
458; X32-SSE-NEXT:    pandn %xmm2, %xmm5
459; X32-SSE-NEXT:    psllw $4, %xmm2
460; X32-SSE-NEXT:    pand %xmm4, %xmm2
461; X32-SSE-NEXT:    por %xmm5, %xmm2
462; X32-SSE-NEXT:    paddw %xmm1, %xmm1
463; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
464; X32-SSE-NEXT:    psraw $15, %xmm4
465; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
466; X32-SSE-NEXT:    pandn %xmm2, %xmm5
467; X32-SSE-NEXT:    psllw $2, %xmm2
468; X32-SSE-NEXT:    pand %xmm4, %xmm2
469; X32-SSE-NEXT:    por %xmm5, %xmm2
470; X32-SSE-NEXT:    paddw %xmm1, %xmm1
471; X32-SSE-NEXT:    psraw $15, %xmm1
472; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
473; X32-SSE-NEXT:    pandn %xmm2, %xmm4
474; X32-SSE-NEXT:    psllw $1, %xmm2
475; X32-SSE-NEXT:    pand %xmm1, %xmm2
476; X32-SSE-NEXT:    psllw $12, %xmm3
477; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
478; X32-SSE-NEXT:    psraw $15, %xmm1
479; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
480; X32-SSE-NEXT:    pandn %xmm0, %xmm5
481; X32-SSE-NEXT:    psrlw $8, %xmm0
482; X32-SSE-NEXT:    pand %xmm1, %xmm0
483; X32-SSE-NEXT:    por %xmm5, %xmm0
484; X32-SSE-NEXT:    paddw %xmm3, %xmm3
485; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
486; X32-SSE-NEXT:    psraw $15, %xmm1
487; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
488; X32-SSE-NEXT:    pandn %xmm0, %xmm5
489; X32-SSE-NEXT:    psrlw $4, %xmm0
490; X32-SSE-NEXT:    pand %xmm1, %xmm0
491; X32-SSE-NEXT:    por %xmm5, %xmm0
492; X32-SSE-NEXT:    paddw %xmm3, %xmm3
493; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
494; X32-SSE-NEXT:    psraw $15, %xmm1
495; X32-SSE-NEXT:    movdqa %xmm1, %xmm5
496; X32-SSE-NEXT:    pandn %xmm0, %xmm5
497; X32-SSE-NEXT:    psrlw $2, %xmm0
498; X32-SSE-NEXT:    pand %xmm1, %xmm0
499; X32-SSE-NEXT:    por %xmm5, %xmm0
500; X32-SSE-NEXT:    paddw %xmm3, %xmm3
501; X32-SSE-NEXT:    psraw $15, %xmm3
502; X32-SSE-NEXT:    movdqa %xmm3, %xmm1
503; X32-SSE-NEXT:    pandn %xmm0, %xmm1
504; X32-SSE-NEXT:    psrlw $1, %xmm0
505; X32-SSE-NEXT:    pand %xmm3, %xmm0
506; X32-SSE-NEXT:    por %xmm1, %xmm0
507; X32-SSE-NEXT:    por %xmm4, %xmm0
508; X32-SSE-NEXT:    por %xmm2, %xmm0
509; X32-SSE-NEXT:    retl
510  %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
511  %shl = shl <8 x i16> %a, %b
512  %lshr = lshr <8 x i16> %a, %b16
513  %or = or <8 x i16> %shl, %lshr
514  ret <8 x i16> %or
515}
516
517define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
518; SSE2-LABEL: var_rotate_v16i8:
519; SSE2:       # BB#0:
520; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
521; SSE2-NEXT:    psubb %xmm1, %xmm4
522; SSE2-NEXT:    psllw $5, %xmm1
523; SSE2-NEXT:    pxor %xmm3, %xmm3
524; SSE2-NEXT:    pxor %xmm2, %xmm2
525; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
526; SSE2-NEXT:    movdqa %xmm0, %xmm5
527; SSE2-NEXT:    psllw $4, %xmm5
528; SSE2-NEXT:    pand {{.*}}(%rip), %xmm5
529; SSE2-NEXT:    pand %xmm2, %xmm5
530; SSE2-NEXT:    pandn %xmm0, %xmm2
531; SSE2-NEXT:    por %xmm5, %xmm2
532; SSE2-NEXT:    paddb %xmm1, %xmm1
533; SSE2-NEXT:    pxor %xmm5, %xmm5
534; SSE2-NEXT:    pcmpgtb %xmm1, %xmm5
535; SSE2-NEXT:    movdqa %xmm5, %xmm6
536; SSE2-NEXT:    pandn %xmm2, %xmm6
537; SSE2-NEXT:    psllw $2, %xmm2
538; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
539; SSE2-NEXT:    pand %xmm5, %xmm2
540; SSE2-NEXT:    por %xmm6, %xmm2
541; SSE2-NEXT:    paddb %xmm1, %xmm1
542; SSE2-NEXT:    pxor %xmm5, %xmm5
543; SSE2-NEXT:    pcmpgtb %xmm1, %xmm5
544; SSE2-NEXT:    movdqa %xmm5, %xmm1
545; SSE2-NEXT:    pandn %xmm2, %xmm1
546; SSE2-NEXT:    paddb %xmm2, %xmm2
547; SSE2-NEXT:    pand %xmm5, %xmm2
548; SSE2-NEXT:    psllw $5, %xmm4
549; SSE2-NEXT:    pxor %xmm5, %xmm5
550; SSE2-NEXT:    pcmpgtb %xmm4, %xmm5
551; SSE2-NEXT:    movdqa %xmm5, %xmm6
552; SSE2-NEXT:    pandn %xmm0, %xmm6
553; SSE2-NEXT:    psrlw $4, %xmm0
554; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
555; SSE2-NEXT:    pand %xmm5, %xmm0
556; SSE2-NEXT:    por %xmm6, %xmm0
557; SSE2-NEXT:    paddb %xmm4, %xmm4
558; SSE2-NEXT:    pxor %xmm5, %xmm5
559; SSE2-NEXT:    pcmpgtb %xmm4, %xmm5
560; SSE2-NEXT:    movdqa %xmm5, %xmm6
561; SSE2-NEXT:    pandn %xmm0, %xmm6
562; SSE2-NEXT:    psrlw $2, %xmm0
563; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
564; SSE2-NEXT:    pand %xmm5, %xmm0
565; SSE2-NEXT:    por %xmm6, %xmm0
566; SSE2-NEXT:    paddb %xmm4, %xmm4
567; SSE2-NEXT:    pcmpgtb %xmm4, %xmm3
568; SSE2-NEXT:    movdqa %xmm3, %xmm4
569; SSE2-NEXT:    pandn %xmm0, %xmm4
570; SSE2-NEXT:    psrlw $1, %xmm0
571; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
572; SSE2-NEXT:    pand %xmm3, %xmm0
573; SSE2-NEXT:    por %xmm4, %xmm0
574; SSE2-NEXT:    por %xmm1, %xmm0
575; SSE2-NEXT:    por %xmm2, %xmm0
576; SSE2-NEXT:    retq
577;
578; SSE41-LABEL: var_rotate_v16i8:
579; SSE41:       # BB#0:
580; SSE41-NEXT:    movdqa %xmm1, %xmm3
581; SSE41-NEXT:    movdqa %xmm0, %xmm1
582; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
583; SSE41-NEXT:    psubb %xmm3, %xmm2
584; SSE41-NEXT:    psllw $5, %xmm3
585; SSE41-NEXT:    movdqa %xmm1, %xmm5
586; SSE41-NEXT:    psllw $4, %xmm5
587; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
588; SSE41-NEXT:    movdqa %xmm1, %xmm4
589; SSE41-NEXT:    movdqa %xmm3, %xmm0
590; SSE41-NEXT:    pblendvb %xmm5, %xmm4
591; SSE41-NEXT:    movdqa %xmm4, %xmm5
592; SSE41-NEXT:    psllw $2, %xmm5
593; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
594; SSE41-NEXT:    paddb %xmm3, %xmm3
595; SSE41-NEXT:    movdqa %xmm3, %xmm0
596; SSE41-NEXT:    pblendvb %xmm5, %xmm4
597; SSE41-NEXT:    movdqa %xmm4, %xmm5
598; SSE41-NEXT:    paddb %xmm5, %xmm5
599; SSE41-NEXT:    paddb %xmm3, %xmm3
600; SSE41-NEXT:    movdqa %xmm3, %xmm0
601; SSE41-NEXT:    pblendvb %xmm5, %xmm4
602; SSE41-NEXT:    psllw $5, %xmm2
603; SSE41-NEXT:    movdqa %xmm2, %xmm3
604; SSE41-NEXT:    paddb %xmm3, %xmm3
605; SSE41-NEXT:    movdqa %xmm1, %xmm5
606; SSE41-NEXT:    psrlw $4, %xmm5
607; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
608; SSE41-NEXT:    movdqa %xmm2, %xmm0
609; SSE41-NEXT:    pblendvb %xmm5, %xmm1
610; SSE41-NEXT:    movdqa %xmm1, %xmm2
611; SSE41-NEXT:    psrlw $2, %xmm2
612; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
613; SSE41-NEXT:    movdqa %xmm3, %xmm0
614; SSE41-NEXT:    pblendvb %xmm2, %xmm1
615; SSE41-NEXT:    movdqa %xmm1, %xmm2
616; SSE41-NEXT:    psrlw $1, %xmm2
617; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
618; SSE41-NEXT:    paddb %xmm3, %xmm3
619; SSE41-NEXT:    movdqa %xmm3, %xmm0
620; SSE41-NEXT:    pblendvb %xmm2, %xmm1
621; SSE41-NEXT:    por %xmm4, %xmm1
622; SSE41-NEXT:    movdqa %xmm1, %xmm0
623; SSE41-NEXT:    retq
624;
625; AVX-LABEL: var_rotate_v16i8:
626; AVX:       # BB#0:
627; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
628; AVX-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
629; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
630; AVX-NEXT:    vpsllw $4, %xmm0, %xmm3
631; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
632; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm3
633; AVX-NEXT:    vpsllw $2, %xmm3, %xmm4
634; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
635; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
636; AVX-NEXT:    vpblendvb %xmm1, %xmm4, %xmm3, %xmm3
637; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm4
638; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
639; AVX-NEXT:    vpblendvb %xmm1, %xmm4, %xmm3, %xmm1
640; AVX-NEXT:    vpsllw $5, %xmm2, %xmm2
641; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
642; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
643; AVX-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
644; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
645; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
646; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
647; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
648; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
649; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
650; AVX-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
651; AVX-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
652; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
653; AVX-NEXT:    retq
654;
655; XOP-LABEL: var_rotate_v16i8:
656; XOP:       # BB#0:
657; XOP-NEXT:    vprotb %xmm1, %xmm0, %xmm0
658; XOP-NEXT:    retq
659;
660; X32-SSE-LABEL: var_rotate_v16i8:
661; X32-SSE:       # BB#0:
662; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
663; X32-SSE-NEXT:    psubb %xmm1, %xmm4
664; X32-SSE-NEXT:    psllw $5, %xmm1
665; X32-SSE-NEXT:    pxor %xmm3, %xmm3
666; X32-SSE-NEXT:    pxor %xmm2, %xmm2
667; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
668; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
669; X32-SSE-NEXT:    psllw $4, %xmm5
670; X32-SSE-NEXT:    pand .LCPI3_1, %xmm5
671; X32-SSE-NEXT:    pand %xmm2, %xmm5
672; X32-SSE-NEXT:    pandn %xmm0, %xmm2
673; X32-SSE-NEXT:    por %xmm5, %xmm2
674; X32-SSE-NEXT:    paddb %xmm1, %xmm1
675; X32-SSE-NEXT:    pxor %xmm5, %xmm5
676; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm5
677; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
678; X32-SSE-NEXT:    pandn %xmm2, %xmm6
679; X32-SSE-NEXT:    psllw $2, %xmm2
680; X32-SSE-NEXT:    pand .LCPI3_2, %xmm2
681; X32-SSE-NEXT:    pand %xmm5, %xmm2
682; X32-SSE-NEXT:    por %xmm6, %xmm2
683; X32-SSE-NEXT:    paddb %xmm1, %xmm1
684; X32-SSE-NEXT:    pxor %xmm5, %xmm5
685; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm5
686; X32-SSE-NEXT:    movdqa %xmm5, %xmm1
687; X32-SSE-NEXT:    pandn %xmm2, %xmm1
688; X32-SSE-NEXT:    paddb %xmm2, %xmm2
689; X32-SSE-NEXT:    pand %xmm5, %xmm2
690; X32-SSE-NEXT:    psllw $5, %xmm4
691; X32-SSE-NEXT:    pxor %xmm5, %xmm5
692; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
693; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
694; X32-SSE-NEXT:    pandn %xmm0, %xmm6
695; X32-SSE-NEXT:    psrlw $4, %xmm0
696; X32-SSE-NEXT:    pand .LCPI3_3, %xmm0
697; X32-SSE-NEXT:    pand %xmm5, %xmm0
698; X32-SSE-NEXT:    por %xmm6, %xmm0
699; X32-SSE-NEXT:    paddb %xmm4, %xmm4
700; X32-SSE-NEXT:    pxor %xmm5, %xmm5
701; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
702; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
703; X32-SSE-NEXT:    pandn %xmm0, %xmm6
704; X32-SSE-NEXT:    psrlw $2, %xmm0
705; X32-SSE-NEXT:    pand .LCPI3_4, %xmm0
706; X32-SSE-NEXT:    pand %xmm5, %xmm0
707; X32-SSE-NEXT:    por %xmm6, %xmm0
708; X32-SSE-NEXT:    paddb %xmm4, %xmm4
709; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm3
710; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
711; X32-SSE-NEXT:    pandn %xmm0, %xmm4
712; X32-SSE-NEXT:    psrlw $1, %xmm0
713; X32-SSE-NEXT:    pand .LCPI3_5, %xmm0
714; X32-SSE-NEXT:    pand %xmm3, %xmm0
715; X32-SSE-NEXT:    por %xmm4, %xmm0
716; X32-SSE-NEXT:    por %xmm1, %xmm0
717; X32-SSE-NEXT:    por %xmm2, %xmm0
718; X32-SSE-NEXT:    retl
719  %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
720  %shl = shl <16 x i8> %a, %b
721  %lshr = lshr <16 x i8> %a, %b8
722  %or = or <16 x i8> %shl, %lshr
723  ret <16 x i8> %or
724}
725
726;
727; Constant Rotates
728;
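; Same shl/lshr/or pattern, but the per-lane rotate amounts are constants, so
; the shifts can be lowered to immediates, multiplies (pmuludq/pmullw) or
; variable-shift instructions with constant-pool operands.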
729
730define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
731; SSE2-LABEL: constant_rotate_v2i64:
732; SSE2:       # BB#0:
733; SSE2-NEXT:    movdqa %xmm0, %xmm2
734; SSE2-NEXT:    psllq $14, %xmm2
735; SSE2-NEXT:    movdqa %xmm0, %xmm1
736; SSE2-NEXT:    psllq $4, %xmm1
737; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
738; SSE2-NEXT:    movdqa %xmm0, %xmm1
739; SSE2-NEXT:    psrlq $50, %xmm1
740; SSE2-NEXT:    psrlq $60, %xmm0
741; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
742; SSE2-NEXT:    orpd %xmm2, %xmm1
743; SSE2-NEXT:    movapd %xmm1, %xmm0
744; SSE2-NEXT:    retq
745;
746; SSE41-LABEL: constant_rotate_v2i64:
747; SSE41:       # BB#0:
748; SSE41-NEXT:    movdqa %xmm0, %xmm1
749; SSE41-NEXT:    psllq $14, %xmm1
750; SSE41-NEXT:    movdqa %xmm0, %xmm2
751; SSE41-NEXT:    psllq $4, %xmm2
752; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
753; SSE41-NEXT:    movdqa %xmm0, %xmm1
754; SSE41-NEXT:    psrlq $50, %xmm1
755; SSE41-NEXT:    psrlq $60, %xmm0
756; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
757; SSE41-NEXT:    por %xmm2, %xmm0
758; SSE41-NEXT:    retq
759;
760; AVX1-LABEL: constant_rotate_v2i64:
761; AVX1:       # BB#0:
762; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm1
763; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm2
764; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
765; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm2
766; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
767; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
768; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
769; AVX1-NEXT:    retq
770;
771; AVX2-LABEL: constant_rotate_v2i64:
772; AVX2:       # BB#0:
773; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm1
774; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
775; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
776; AVX2-NEXT:    retq
777;
778; XOPAVX1-LABEL: constant_rotate_v2i64:
779; XOPAVX1:       # BB#0:
780; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm1
781; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
782; XOPAVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm2, %xmm2
783; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm0, %xmm0
784; XOPAVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
785; XOPAVX1-NEXT:    retq
786;
787; XOPAVX2-LABEL: constant_rotate_v2i64:
788; XOPAVX2:       # BB#0:
789; XOPAVX2-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm1
790; XOPAVX2-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
791; XOPAVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
792; XOPAVX2-NEXT:    retq
793;
794; X32-SSE-LABEL: constant_rotate_v2i64:
795; X32-SSE:       # BB#0:
796; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
797; X32-SSE-NEXT:    psllq $14, %xmm2
798; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
799; X32-SSE-NEXT:    psllq $4, %xmm1
800; X32-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
801; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
802; X32-SSE-NEXT:    psrlq $50, %xmm1
803; X32-SSE-NEXT:    psrlq $60, %xmm0
804; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
805; X32-SSE-NEXT:    orpd %xmm2, %xmm1
806; X32-SSE-NEXT:    movapd %xmm1, %xmm0
807; X32-SSE-NEXT:    retl
808  %shl = shl <2 x i64> %a, <i64 4, i64 14>
809  %lshr = lshr <2 x i64> %a, <i64 60, i64 50>
810  %or = or <2 x i64> %shl, %lshr
811  ret <2 x i64> %or
812}
813
814define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
815; SSE2-LABEL: constant_rotate_v4i32:
816; SSE2:       # BB#0:
817; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
818; SSE2-NEXT:    movdqa %xmm0, %xmm2
819; SSE2-NEXT:    pmuludq %xmm1, %xmm2
820; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
821; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
822; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
823; SSE2-NEXT:    pmuludq %xmm1, %xmm3
824; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
825; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
826; SSE2-NEXT:    movdqa %xmm0, %xmm1
827; SSE2-NEXT:    psrld $25, %xmm1
828; SSE2-NEXT:    movdqa %xmm0, %xmm3
829; SSE2-NEXT:    psrld $27, %xmm3
830; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
831; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
832; SSE2-NEXT:    movdqa %xmm0, %xmm3
833; SSE2-NEXT:    psrld $26, %xmm3
834; SSE2-NEXT:    psrld $28, %xmm0
835; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
836; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
837; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
838; SSE2-NEXT:    por %xmm2, %xmm0
839; SSE2-NEXT:    retq
840;
841; SSE41-LABEL: constant_rotate_v4i32:
842; SSE41:       # BB#0:
843; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
844; SSE41-NEXT:    pmulld %xmm0, %xmm1
845; SSE41-NEXT:    movdqa %xmm0, %xmm2
846; SSE41-NEXT:    psrld $25, %xmm2
847; SSE41-NEXT:    movdqa %xmm0, %xmm3
848; SSE41-NEXT:    psrld $27, %xmm3
849; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
850; SSE41-NEXT:    movdqa %xmm0, %xmm2
851; SSE41-NEXT:    psrld $26, %xmm2
852; SSE41-NEXT:    psrld $28, %xmm0
853; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
854; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
855; SSE41-NEXT:    por %xmm1, %xmm0
856; SSE41-NEXT:    retq
857;
858; AVX1-LABEL: constant_rotate_v4i32:
859; AVX1:       # BB#0:
860; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm1
861; AVX1-NEXT:    vpsrld $25, %xmm0, %xmm2
862; AVX1-NEXT:    vpsrld $27, %xmm0, %xmm3
863; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
864; AVX1-NEXT:    vpsrld $26, %xmm0, %xmm3
865; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm0
866; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
867; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
868; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
869; AVX1-NEXT:    retq
870;
871; AVX2-LABEL: constant_rotate_v4i32:
872; AVX2:       # BB#0:
873; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm1
874; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
875; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
876; AVX2-NEXT:    retq
877;
878; XOPAVX1-LABEL: constant_rotate_v4i32:
879; XOPAVX1:       # BB#0:
880; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm1
881; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
882; XOPAVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
883; XOPAVX1-NEXT:    retq
884;
885; XOPAVX2-LABEL: constant_rotate_v4i32:
886; XOPAVX2:       # BB#0:
887; XOPAVX2-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm1
888; XOPAVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
889; XOPAVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
890; XOPAVX2-NEXT:    retq
891;
892; X32-SSE-LABEL: constant_rotate_v4i32:
893; X32-SSE:       # BB#0:
894; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
895; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
896; X32-SSE-NEXT:    pmuludq %xmm1, %xmm2
897; X32-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
898; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
899; X32-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
900; X32-SSE-NEXT:    pmuludq %xmm1, %xmm3
901; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
902; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
903; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
904; X32-SSE-NEXT:    psrld $25, %xmm1
905; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
906; X32-SSE-NEXT:    psrld $27, %xmm3
907; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
908; X32-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
909; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
910; X32-SSE-NEXT:    psrld $26, %xmm3
911; X32-SSE-NEXT:    psrld $28, %xmm0
912; X32-SSE-NEXT:    movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
913; X32-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
914; X32-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
915; X32-SSE-NEXT:    por %xmm2, %xmm0
916; X32-SSE-NEXT:    retl
917  %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
918  %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25>
919  %or = or <4 x i32> %shl, %lshr
920  ret <4 x i32> %or
921}
922
923define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
924; SSE2-LABEL: constant_rotate_v8i16:
925; SSE2:       # BB#0:
926; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
927; SSE2-NEXT:    pmullw %xmm0, %xmm2
928; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
929; SSE2-NEXT:    movdqa %xmm1, %xmm3
930; SSE2-NEXT:    pandn %xmm0, %xmm3
931; SSE2-NEXT:    psrlw $8, %xmm0
932; SSE2-NEXT:    pand %xmm1, %xmm0
933; SSE2-NEXT:    por %xmm3, %xmm0
934; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,0,0]
935; SSE2-NEXT:    movdqa %xmm1, %xmm3
936; SSE2-NEXT:    pandn %xmm0, %xmm3
937; SSE2-NEXT:    psrlw $4, %xmm0
938; SSE2-NEXT:    pand %xmm1, %xmm0
939; SSE2-NEXT:    por %xmm3, %xmm0
940; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0]
941; SSE2-NEXT:    movdqa %xmm1, %xmm3
942; SSE2-NEXT:    pandn %xmm0, %xmm3
943; SSE2-NEXT:    psrlw $2, %xmm0
944; SSE2-NEXT:    pand %xmm1, %xmm0
945; SSE2-NEXT:    por %xmm3, %xmm0
946; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
947; SSE2-NEXT:    movdqa %xmm0, %xmm1
948; SSE2-NEXT:    pand %xmm3, %xmm1
949; SSE2-NEXT:    psrlw $1, %xmm0
950; SSE2-NEXT:    pandn %xmm0, %xmm3
951; SSE2-NEXT:    por %xmm2, %xmm3
952; SSE2-NEXT:    por %xmm3, %xmm1
953; SSE2-NEXT:    movdqa %xmm1, %xmm0
954; SSE2-NEXT:    retq
955;
956; SSE41-LABEL: constant_rotate_v8i16:
957; SSE41:       # BB#0:
958; SSE41-NEXT:    movdqa %xmm0, %xmm1
959; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
960; SSE41-NEXT:    pmullw %xmm1, %xmm2
961; SSE41-NEXT:    movdqa %xmm1, %xmm3
962; SSE41-NEXT:    psrlw $8, %xmm3
963; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [256,61680,57568,53456,49344,45232,41120,37008]
964; SSE41-NEXT:    pblendvb %xmm3, %xmm1
965; SSE41-NEXT:    movdqa %xmm1, %xmm3
966; SSE41-NEXT:    psrlw $4, %xmm3
967; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [512,57824,49600,41376,33152,24928,16704,8480]
968; SSE41-NEXT:    pblendvb %xmm3, %xmm1
969; SSE41-NEXT:    movdqa %xmm1, %xmm3
970; SSE41-NEXT:    psrlw $2, %xmm3
971; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [1024,50112,33664,17216,768,49856,33408,16960]
972; SSE41-NEXT:    pblendvb %xmm3, %xmm1
973; SSE41-NEXT:    movdqa %xmm1, %xmm3
974; SSE41-NEXT:    psrlw $1, %xmm3
975; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [2048,34688,1792,34432,1536,34176,1280,33920]
976; SSE41-NEXT:    pblendvb %xmm3, %xmm1
977; SSE41-NEXT:    por %xmm2, %xmm1
978; SSE41-NEXT:    movdqa %xmm1, %xmm0
979; SSE41-NEXT:    retq
980;
981; AVX1-LABEL: constant_rotate_v8i16:
982; AVX1:       # BB#0:
983; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
984; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
985; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [256,61680,57568,53456,49344,45232,41120,37008]
986; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
987; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
988; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [512,57824,49600,41376,33152,24928,16704,8480]
989; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
990; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
991; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1024,50112,33664,17216,768,49856,33408,16960]
992; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
993; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
994; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2048,34688,1792,34432,1536,34176,1280,33920]
995; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
996; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
997; AVX1-NEXT:    retq
998;
999; AVX2-LABEL: constant_rotate_v8i16:
1000; AVX2:       # BB#0:
1001; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
1002; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1003; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1004; AVX2-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm0
1005; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1006; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1007; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
1008; AVX2-NEXT:    vzeroupper
1009; AVX2-NEXT:    retq
1010;
1011; XOP-LABEL: constant_rotate_v8i16:
1012; XOP:       # BB#0:
1013; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
1014; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1015; XOP-NEXT:    vpsubw {{.*}}(%rip), %xmm2, %xmm2
1016; XOP-NEXT:    vpshlw %xmm2, %xmm0, %xmm0
1017; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1018; XOP-NEXT:    retq
1019;
1020; X32-SSE-LABEL: constant_rotate_v8i16:
1021; X32-SSE:       # BB#0:
1022; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
1023; X32-SSE-NEXT:    pmullw %xmm0, %xmm2
1024; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
1025; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
1026; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1027; X32-SSE-NEXT:    psrlw $8, %xmm0
1028; X32-SSE-NEXT:    pand %xmm1, %xmm0
1029; X32-SSE-NEXT:    por %xmm3, %xmm0
1030; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,0,0,0]
1031; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
1032; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1033; X32-SSE-NEXT:    psrlw $4, %xmm0
1034; X32-SSE-NEXT:    pand %xmm1, %xmm0
1035; X32-SSE-NEXT:    por %xmm3, %xmm0
1036; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,0,0,65535,65535,0]
1037; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
1038; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1039; X32-SSE-NEXT:    psrlw $2, %xmm0
1040; X32-SSE-NEXT:    pand %xmm1, %xmm0
1041; X32-SSE-NEXT:    por %xmm3, %xmm0
1042; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
1043; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
1044; X32-SSE-NEXT:    pand %xmm3, %xmm1
1045; X32-SSE-NEXT:    psrlw $1, %xmm0
1046; X32-SSE-NEXT:    pandn %xmm0, %xmm3
1047; X32-SSE-NEXT:    por %xmm2, %xmm3
1048; X32-SSE-NEXT:    por %xmm3, %xmm1
1049; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
1050; X32-SSE-NEXT:    retl
1051  %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
1052  %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9>
1053  %or = or <8 x i16> %shl, %lshr
1054  ret <8 x i16> %or
1055}
1056
1057define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
1058; SSE2-LABEL: constant_rotate_v16i8:
1059; SSE2:       # BB#0:
1060; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1061; SSE2-NEXT:    psllw $5, %xmm3
1062; SSE2-NEXT:    pxor %xmm2, %xmm2
1063; SSE2-NEXT:    pxor %xmm1, %xmm1
1064; SSE2-NEXT:    pcmpgtb %xmm3, %xmm1
1065; SSE2-NEXT:    movdqa %xmm0, %xmm4
1066; SSE2-NEXT:    psllw $4, %xmm4
1067; SSE2-NEXT:    pand {{.*}}(%rip), %xmm4
1068; SSE2-NEXT:    pand %xmm1, %xmm4
1069; SSE2-NEXT:    pandn %xmm0, %xmm1
1070; SSE2-NEXT:    por %xmm4, %xmm1
1071; SSE2-NEXT:    paddb %xmm3, %xmm3
1072; SSE2-NEXT:    pxor %xmm4, %xmm4
1073; SSE2-NEXT:    pcmpgtb %xmm3, %xmm4
1074; SSE2-NEXT:    movdqa %xmm4, %xmm5
1075; SSE2-NEXT:    pandn %xmm1, %xmm5
1076; SSE2-NEXT:    psllw $2, %xmm1
1077; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
1078; SSE2-NEXT:    pand %xmm4, %xmm1
1079; SSE2-NEXT:    por %xmm5, %xmm1
1080; SSE2-NEXT:    paddb %xmm3, %xmm3
1081; SSE2-NEXT:    pxor %xmm4, %xmm4
1082; SSE2-NEXT:    pcmpgtb %xmm3, %xmm4
1083; SSE2-NEXT:    movdqa %xmm4, %xmm3
1084; SSE2-NEXT:    pandn %xmm1, %xmm3
1085; SSE2-NEXT:    paddb %xmm1, %xmm1
1086; SSE2-NEXT:    pand %xmm4, %xmm1
1087; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1088; SSE2-NEXT:    psllw $5, %xmm4
1089; SSE2-NEXT:    pxor %xmm5, %xmm5
1090; SSE2-NEXT:    pcmpgtb %xmm4, %xmm5
1091; SSE2-NEXT:    movdqa %xmm5, %xmm6
1092; SSE2-NEXT:    pandn %xmm0, %xmm6
1093; SSE2-NEXT:    psrlw $4, %xmm0
1094; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1095; SSE2-NEXT:    pand %xmm5, %xmm0
1096; SSE2-NEXT:    por %xmm6, %xmm0
1097; SSE2-NEXT:    paddb %xmm4, %xmm4
1098; SSE2-NEXT:    pxor %xmm5, %xmm5
1099; SSE2-NEXT:    pcmpgtb %xmm4, %xmm5
1100; SSE2-NEXT:    movdqa %xmm5, %xmm6
1101; SSE2-NEXT:    pandn %xmm0, %xmm6
1102; SSE2-NEXT:    psrlw $2, %xmm0
1103; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1104; SSE2-NEXT:    pand %xmm5, %xmm0
1105; SSE2-NEXT:    por %xmm6, %xmm0
1106; SSE2-NEXT:    paddb %xmm4, %xmm4
1107; SSE2-NEXT:    pcmpgtb %xmm4, %xmm2
1108; SSE2-NEXT:    movdqa %xmm2, %xmm4
1109; SSE2-NEXT:    pandn %xmm0, %xmm4
1110; SSE2-NEXT:    psrlw $1, %xmm0
1111; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
1112; SSE2-NEXT:    pand %xmm2, %xmm0
1113; SSE2-NEXT:    por %xmm4, %xmm0
1114; SSE2-NEXT:    por %xmm3, %xmm0
1115; SSE2-NEXT:    por %xmm1, %xmm0
1116; SSE2-NEXT:    retq
1117;
1118; SSE41-LABEL: constant_rotate_v16i8:
1119; SSE41:       # BB#0:
1120; SSE41-NEXT:    movdqa %xmm0, %xmm1
1121; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1122; SSE41-NEXT:    psllw $5, %xmm0
1123; SSE41-NEXT:    movdqa %xmm1, %xmm3
1124; SSE41-NEXT:    psllw $4, %xmm3
1125; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1126; SSE41-NEXT:    movdqa %xmm1, %xmm2
1127; SSE41-NEXT:    pblendvb %xmm3, %xmm2
1128; SSE41-NEXT:    movdqa %xmm2, %xmm3
1129; SSE41-NEXT:    psllw $2, %xmm3
1130; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1131; SSE41-NEXT:    paddb %xmm0, %xmm0
1132; SSE41-NEXT:    pblendvb %xmm3, %xmm2
1133; SSE41-NEXT:    movdqa %xmm2, %xmm3
1134; SSE41-NEXT:    paddb %xmm3, %xmm3
1135; SSE41-NEXT:    paddb %xmm0, %xmm0
1136; SSE41-NEXT:    pblendvb %xmm3, %xmm2
1137; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1138; SSE41-NEXT:    psllw $5, %xmm0
1139; SSE41-NEXT:    movdqa %xmm1, %xmm3
1140; SSE41-NEXT:    psrlw $4, %xmm3
1141; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1142; SSE41-NEXT:    pblendvb %xmm3, %xmm1
1143; SSE41-NEXT:    movdqa %xmm1, %xmm3
1144; SSE41-NEXT:    psrlw $2, %xmm3
1145; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1146; SSE41-NEXT:    paddb %xmm0, %xmm0
1147; SSE41-NEXT:    pblendvb %xmm3, %xmm1
1148; SSE41-NEXT:    movdqa %xmm1, %xmm3
1149; SSE41-NEXT:    psrlw $1, %xmm3
1150; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
1151; SSE41-NEXT:    paddb %xmm0, %xmm0
1152; SSE41-NEXT:    pblendvb %xmm3, %xmm1
1153; SSE41-NEXT:    por %xmm2, %xmm1
1154; SSE41-NEXT:    movdqa %xmm1, %xmm0
1155; SSE41-NEXT:    retq
1156;
1157; AVX-LABEL: constant_rotate_v16i8:
1158; AVX:       # BB#0:
1159; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1160; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
1161; AVX-NEXT:    vpsllw $4, %xmm0, %xmm2
1162; AVX-NEXT:    vpand {{.*}}(%rip), %xmm2, %xmm2
1163; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm2
1164; AVX-NEXT:    vpsllw $2, %xmm2, %xmm3
1165; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1166; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1167; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
1168; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
1169; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
1170; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm1
1171; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1172; AVX-NEXT:    vpsllw $5, %xmm2, %xmm2
1173; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm3
1174; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1175; AVX-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
1176; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm3
1177; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1178; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1179; AVX-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
1180; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm3
1181; AVX-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
1182; AVX-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
1183; AVX-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
1184; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1185; AVX-NEXT:    retq
1186;
1187; XOP-LABEL: constant_rotate_v16i8:
1188; XOP:       # BB#0:
1189; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm1
1190; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1191; XOP-NEXT:    vpsubb {{.*}}(%rip), %xmm2, %xmm2
1192; XOP-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
1193; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1194; XOP-NEXT:    retq
1195;
1196; X32-SSE-LABEL: constant_rotate_v16i8:
1197; X32-SSE:       # BB#0:
1198; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
1199; X32-SSE-NEXT:    psllw $5, %xmm3
1200; X32-SSE-NEXT:    pxor %xmm2, %xmm2
1201; X32-SSE-NEXT:    pxor %xmm1, %xmm1
1202; X32-SSE-NEXT:    pcmpgtb %xmm3, %xmm1
1203; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
1204; X32-SSE-NEXT:    psllw $4, %xmm4
1205; X32-SSE-NEXT:    pand .LCPI7_1, %xmm4
1206; X32-SSE-NEXT:    pand %xmm1, %xmm4
1207; X32-SSE-NEXT:    pandn %xmm0, %xmm1
1208; X32-SSE-NEXT:    por %xmm4, %xmm1
1209; X32-SSE-NEXT:    paddb %xmm3, %xmm3
1210; X32-SSE-NEXT:    pxor %xmm4, %xmm4
1211; X32-SSE-NEXT:    pcmpgtb %xmm3, %xmm4
1212; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
1213; X32-SSE-NEXT:    pandn %xmm1, %xmm5
1214; X32-SSE-NEXT:    psllw $2, %xmm1
1215; X32-SSE-NEXT:    pand .LCPI7_2, %xmm1
1216; X32-SSE-NEXT:    pand %xmm4, %xmm1
1217; X32-SSE-NEXT:    por %xmm5, %xmm1
1218; X32-SSE-NEXT:    paddb %xmm3, %xmm3
1219; X32-SSE-NEXT:    pxor %xmm4, %xmm4
1220; X32-SSE-NEXT:    pcmpgtb %xmm3, %xmm4
1221; X32-SSE-NEXT:    movdqa %xmm4, %xmm3
1222; X32-SSE-NEXT:    pandn %xmm1, %xmm3
1223; X32-SSE-NEXT:    paddb %xmm1, %xmm1
1224; X32-SSE-NEXT:    pand %xmm4, %xmm1
1225; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
1226; X32-SSE-NEXT:    psllw $5, %xmm4
1227; X32-SSE-NEXT:    pxor %xmm5, %xmm5
1228; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
1229; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
1230; X32-SSE-NEXT:    pandn %xmm0, %xmm6
1231; X32-SSE-NEXT:    psrlw $4, %xmm0
1232; X32-SSE-NEXT:    pand .LCPI7_4, %xmm0
1233; X32-SSE-NEXT:    pand %xmm5, %xmm0
1234; X32-SSE-NEXT:    por %xmm6, %xmm0
1235; X32-SSE-NEXT:    paddb %xmm4, %xmm4
1236; X32-SSE-NEXT:    pxor %xmm5, %xmm5
1237; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm5
1238; X32-SSE-NEXT:    movdqa %xmm5, %xmm6
1239; X32-SSE-NEXT:    pandn %xmm0, %xmm6
1240; X32-SSE-NEXT:    psrlw $2, %xmm0
1241; X32-SSE-NEXT:    pand .LCPI7_5, %xmm0
1242; X32-SSE-NEXT:    pand %xmm5, %xmm0
1243; X32-SSE-NEXT:    por %xmm6, %xmm0
1244; X32-SSE-NEXT:    paddb %xmm4, %xmm4
1245; X32-SSE-NEXT:    pcmpgtb %xmm4, %xmm2
1246; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
1247; X32-SSE-NEXT:    pandn %xmm0, %xmm4
1248; X32-SSE-NEXT:    psrlw $1, %xmm0
1249; X32-SSE-NEXT:    pand .LCPI7_6, %xmm0
1250; X32-SSE-NEXT:    pand %xmm2, %xmm0
1251; X32-SSE-NEXT:    por %xmm4, %xmm0
1252; X32-SSE-NEXT:    por %xmm3, %xmm0
1253; X32-SSE-NEXT:    por %xmm1, %xmm0
1254; X32-SSE-NEXT:    retl
1255  %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1256  %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1257  %or = or <16 x i8> %shl, %lshr
1258  ret <16 x i8> %or
1259}
1260
1261;
1262; Uniform Constant Rotates
1263;
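; Every lane is rotated by the same constant amount, so a single pair of
; immediate shifts (or one immediate vprot* on XOP) is sufficient.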

define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $14, %xmm1
; SSE-NEXT:    psrlq $50, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllq $14, %xmm0, %xmm1
; AVX-NEXT:    vpsrlq $50, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vprotq $14, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllq $14, %xmm1
; X32-SSE-NEXT:    psrlq $50, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <2 x i64> %a, <i64 14, i64 14>
  %lshr = lshr <2 x i64> %a, <i64 50, i64 50>
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $4, %xmm1
; SSE-NEXT:    psrld $28, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpslld $4, %xmm0, %xmm1
; AVX-NEXT:    vpsrld $28, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    pslld $4, %xmm1
; X32-SSE-NEXT:    psrld $28, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $7, %xmm1
; SSE-NEXT:    psrlw $9, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $7, %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $9, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vprotw $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllw $7, %xmm1
; X32-SSE-NEXT:    psrlw $9, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $4, %xmm1
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrlw $4, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllw $4, %xmm1
; X32-SSE-NEXT:    pand .LCPI11_0, %xmm1
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand .LCPI11_1, %xmm0
; X32-SSE-NEXT:    por %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Masked Uniform Constant Rotates
;
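; Same splat-constant rotates, but each half is ANDed with an extra constant
; mask before the final OR; the checks verify that the masks survive lowering
; (XOP still selects vprot* followed by a vpand).
;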

define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v2i64:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $15, %xmm1
; SSE-NEXT:    psrlq $49, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v2i64:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllq $15, %xmm0, %xmm1
; AVX-NEXT:    vpsrlq $49, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v2i64:
; XOP:       # BB#0:
; XOP-NEXT:    vprotq $15, %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllq $15, %xmm1
; X32-SSE-NEXT:    psrlq $49, %xmm0
; X32-SSE-NEXT:    pand .LCPI12_0, %xmm0
; X32-SSE-NEXT:    pand .LCPI12_1, %xmm1
; X32-SSE-NEXT:    por %xmm0, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <2 x i64> %a, <i64 15, i64 15>
  %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
  %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
  %lmask = and <2 x i64> %shl, <i64 65, i64 33>
  %or = or <2 x i64> %lmask, %rmask
  ret <2 x i64> %or
}

define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v4i32:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $4, %xmm1
; SSE-NEXT:    psrld $28, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v4i32:
; AVX:       # BB#0:
; AVX-NEXT:    vpslld $4, %xmm0, %xmm1
; AVX-NEXT:    vpsrld $28, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v4i32:
; XOP:       # BB#0:
; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    pslld $4, %xmm1
; X32-SSE-NEXT:    psrld $28, %xmm0
; X32-SSE-NEXT:    pand .LCPI13_0, %xmm0
; X32-SSE-NEXT:    pand .LCPI13_1, %xmm1
; X32-SSE-NEXT:    por %xmm0, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
  %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
  %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
  %or = or <4 x i32> %lmask, %rmask
  ret <4 x i32> %or
}

define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v8i16:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $5, %xmm1
; SSE-NEXT:    psrlw $11, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v8i16:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $5, %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $11, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
; XOP:       # BB#0:
; XOP-NEXT:    vprotw $5, %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllw $5, %xmm1
; X32-SSE-NEXT:    psrlw $11, %xmm0
; X32-SSE-NEXT:    pand .LCPI14_0, %xmm0
; X32-SSE-NEXT:    pand .LCPI14_1, %xmm1
; X32-SSE-NEXT:    por %xmm0, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <8 x i16> %lmask, %rmask
  ret <8 x i16> %or
}

define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v16i8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllw $4, %xmm1
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    psrlw $4, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE-NEXT:    por %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v16i8:
; AVX:       # BB#0:
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP:       # BB#0:
; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
; X32-SSE-NEXT:    psllw $4, %xmm1
; X32-SSE-NEXT:    pand .LCPI15_0, %xmm1
; X32-SSE-NEXT:    psrlw $4, %xmm0
; X32-SSE-NEXT:    pand .LCPI15_1, %xmm0
; X32-SSE-NEXT:    pand .LCPI15_2, %xmm0
; X32-SSE-NEXT:    pand .LCPI15_3, %xmm1
; X32-SSE-NEXT:    por %xmm0, %xmm1
; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
; X32-SSE-NEXT:    retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <16 x i8> %lmask, %rmask
  ret <16 x i8> %or
}