; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

;
; Variable Rotates
;
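; A variable per-element rotate is expressed in IR as
;   (shl %a, %b) | (lshr %a, (bitwidth - %b))
; Targets with native rotates should match this pattern: AVX512 selects
; vprolv[dq] and XOP selects vprot[bwdq]; plain AVX/AVX2 has to synthesize
; the rotate from shifts, blends and ors.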

define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_rotate_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlq %xmm2, %xmm5, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsllvq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v4i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v4i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotq %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <4 x i64> %a, %b
  %lshr = lshr <4 x i64> %a, %b64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_rotate_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT:    vpor %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
; AVX2-NEXT:    vpsubd %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v8i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v8i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v8i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotd %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <8 x i32> %a, %b
  %lshr = lshr <8 x i32> %a, %b32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm4, %xmm6
; AVX1-NEXT:    vpmullw %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpor %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT:    vpsllvd %ymm5, %ymm0, %ymm5
; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
; AVX2-NEXT:    vpackusdw %ymm3, %ymm5, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpsubw %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT:    vpsrlvd %ymm5, %ymm4, %ymm4
; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
; AVX512VL-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <16 x i16> %a, %b
  %lshr = lshr <16 x i16> %a, %b16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm9, %xmm5, %xmm5
; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $6, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT:    vpand %xmm10, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $2, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm7
; AVX1-NEXT:    vpor %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm9, %xmm4, %xmm4
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm10, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $6, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm2
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm2
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT:    vpor %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
; AVX512VLBW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <32 x i8> %a, %b
  %lshr = lshr <32 x i8> %a, %b8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Uniform Variable Rotates
;
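; The rotate amount is splatted from element 0 of %b, so every lane rotates
; by the same (still runtime-variable) count. AVX/AVX2 can then use the
; single-count shift forms (vpsllq, vpslld, vpsllw, ...) instead of
; per-element variable shifts; AVX512 broadcasts the count and still uses
; vprolv[dq].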

define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsllq %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm2
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [64,64,64,64]
; AVX2-NEXT:    vpsubq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatvar_rotate_v4i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512VL-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v4i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512VLBW-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; XOPAVX2-NEXT:    vprotq %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat
  %shl = shl <4 x i64> %a, %splat
  %lshr = lshr <4 x i64> %a, %splat64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpslld %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpslld %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm2
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT:    vpslld %xmm1, %ymm0, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
; AVX2-NEXT:    vpsubd %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatvar_rotate_v8i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX512VL-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v8i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v8i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX512VLBW-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vprotd %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; XOPAVX2-NEXT:    vprotd %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %splat32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <8 x i32> %a, %splat
  %lshr = lshr <8 x i32> %a, %splat32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT:    vpsllw %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpsubw %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm2
; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX2-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatvar_rotate_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastw %xmm1, %ymm2
; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastw %xmm1, %ymm2
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpbroadcastw %xmm1, %ymm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastw %xmm1, %ymm2
; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
; AVX512VLBW-NEXT:    vpsrlvw %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; XOPAVX2-NEXT:    vprotw %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <16 x i16> %a, %splat
  %lshr = lshr <16 x i16> %a, %splat16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm9, %xmm5, %xmm5
; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $6, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT:    vpand %xmm10, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $2, %xmm2, %xmm7
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT:    vpand %xmm11, %xmm7, %xmm7
; AVX1-NEXT:    vpor %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm7
; AVX1-NEXT:    vpblendvb %xmm7, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm5
; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm9, %xmm4, %xmm4
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm10, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm11, %xmm3, %xmm3
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpblendvb %xmm7, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm1
; AVX1-NEXT:    vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpblendvb %xmm5, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $6, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatvar_rotate_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm2
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm2
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT:    vpor %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
; AVX512VLBW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vprotb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; XOPAVX2-NEXT:    vprotb %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <32 x i8> %a, %splat
  %lshr = lshr <32 x i8> %a, %splat8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Constant Rotates
;
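; The rotate amounts are build-vector constants, so both shift-count vectors
; can be materialized from the constant pool ({{.*}}(%rip)); for v16i16 the
; rotate is lowered to a vpmullw/vpmulhuw pair against power-of-two
; multipliers, which yields the low and high halves of the rotated words.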
960
961define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
962; AVX1-LABEL: constant_rotate_v4i64:
963; AVX1:       # %bb.0:
964; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
965; AVX1-NEXT:    vpsllq $60, %xmm1, %xmm2
966; AVX1-NEXT:    vpsllq $50, %xmm1, %xmm3
967; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
968; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm3
969; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm4
970; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
971; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
972; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm3
973; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm1
974; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
975; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm3
976; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
977; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
978; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
979; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
980; AVX1-NEXT:    retq
981;
982; AVX2-LABEL: constant_rotate_v4i64:
983; AVX2:       # %bb.0:
984; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm1
985; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
986; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
987; AVX2-NEXT:    retq
988;
989; AVX512F-LABEL: constant_rotate_v4i64:
990; AVX512F:       # %bb.0:
991; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
992; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
993; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
994; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
995; AVX512F-NEXT:    retq
996;
997; AVX512VL-LABEL: constant_rotate_v4i64:
998; AVX512VL:       # %bb.0:
999; AVX512VL-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
1000; AVX512VL-NEXT:    retq
1001;
1002; AVX512BW-LABEL: constant_rotate_v4i64:
1003; AVX512BW:       # %bb.0:
1004; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1005; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
1006; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
1007; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1008; AVX512BW-NEXT:    retq
1009;
1010; AVX512VLBW-LABEL: constant_rotate_v4i64:
1011; AVX512VLBW:       # %bb.0:
1012; AVX512VLBW-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
1013; AVX512VLBW-NEXT:    retq
1014;
1015; XOPAVX1-LABEL: constant_rotate_v4i64:
1016; XOPAVX1:       # %bb.0:
1017; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
1018; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1019; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
1020; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1021; XOPAVX1-NEXT:    retq
1022;
1023; XOPAVX2-LABEL: constant_rotate_v4i64:
1024; XOPAVX2:       # %bb.0:
1025; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
1026; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1027; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
1028; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1029; XOPAVX2-NEXT:    retq
1030  %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
1031  %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4>
1032  %or = or <4 x i64> %shl, %lshr
1033  ret <4 x i64> %or
1034}
1035
define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v8i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v8i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v8i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

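; In constant_rotate_v8i32 above, AVX1 has no per-lane 32-bit shifts, so both
; halves of the rotate come from one unsigned multiply: the 64-bit product
; x * 2^k holds (x << k) in its low 32 bits and x >> (32 - k) in its high 32
; bits. That is why the constant pool holds the powers of two [16,32,64,128]
; and [256,512,1024,2048] for shift amounts 4..7 and 8..11, and the
; vpmuludq/vpshufd/vpblendw dance merely reassembles the low and high product
; halves before the final vpor. An informal scalar sketch of the identity
; (hypothetical %x, not part of the autogenerated checks):
;   %wide = zext i32 %x to i64
;   %prod = mul i64 %wide, 16          ; x * 2^4
;   %lo   = trunc i64 %prod to i32     ; == x << 4
;   %hi64 = lshr i64 %prod, 32
;   %hi   = trunc i64 %hi64 to i32     ; == x >> 28
;   %rot  = or i32 %lo, %hi            ; == rotl(x, 4)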
define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

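; The v16i16 variant above uses the same multiply trick in packed form:
; vpmullw yields the low 16 bits of x * 2^k (the left shift) and vpmulhuw the
; high 16 bits (the complementary right shift x >> (16 - k)), so a single
; vpor of the two products completes the rotate; the multiplier vector
; [1,2,4,...,32768] is 2^k for k = 0..15.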
define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm9, %xmm4, %xmm4
; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,57600,41152,24704,8256]
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $6, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT:    vpand %xmm10, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $2, %xmm1, %xmm7
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT:    vpand %xmm11, %xmm7, %xmm7
; AVX1-NEXT:    vpor %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm7
; AVX1-NEXT:    vpblendvb %xmm7, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm6
; AVX1-NEXT:    vpor %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm6
; AVX1-NEXT:    vpblendvb %xmm6, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm8, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm9, %xmm3, %xmm3
; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendvb %xmm4, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm10, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm11, %xmm3, %xmm3
; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendvb %xmm7, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
; AVX1-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendvb %xmm6, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $6, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm1
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm1
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm1
; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX1-NEXT:    vprotb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vprotb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX2-NEXT:    vprotb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotb %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

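; No AVX level has byte-granular shifts, so constant_rotate_v32i8 above
; synthesizes byte rotates from word shifts that are masked back to byte
; lanes (the 15/240, 3/252 and 1 vpand constants) and then chooses, per byte,
; between the rotated-so-far and freshly shifted values with a vpblendvb
; ladder; each vpaddb doubles the selector so the next bit of the per-byte
; rotate amount reaches the sign bit that vpblendvb tests. XOP sidesteps all
; of this with its native per-byte rotate, vprotb.
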
;
; Uniform Constant Rotates
;

define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsllq $14, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $50, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $14, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlq $50, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v4i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprolq $14, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolq $14, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprolq $14, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v4i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolq $14, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

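; With a uniform (splat) rotate amount the lowering collapses to immediate
; forms: AVX512 emits vprolq $14 and XOP vprotq $14, while plain AVX expands
; to immediate shifts per 128-bit half. Informally, for any 64-bit lane
; (hypothetical %x, not part of the autogenerated checks):
;   %shl  = shl i64 %x, 14
;   %lshr = lshr i64 %x, 50
;   %rot  = or i64 %shl, %lshr    ; rotl(%x, 14)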
define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v8i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v8i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v8i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $9, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $9, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $9, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_rotate_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $9, %ymm0, %ymm1
; AVX512-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_rotate_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

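; A byte rotate by 4 is a nibble swap, ((x << 4) & 0xF0) | ((x >> 4) & 0x0F),
; which is why splatconstant_rotate_v32i8 above needs only one word-size
; shift in each direction plus the 240/15 byte masks before the vpor.
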
;
; Masked Uniform Constant Rotates
;

define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprolq $15, %zmm0, %zmm0
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolq $15, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprolq $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolq $15, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotq $15, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotq $15, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotq $15, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotq $15, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}

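; In splatconstant_rotate_mask_v4i64 above the masks fold into the rotate:
; shl by 15 clears bits 0..14, and every %lmask element (33, 65, 129, 257)
; lies entirely within those cleared bits, so the left-hand side is provably
; zero. AVX1/AVX2 therefore keep only the vpsrlq $49 plus a vpand of %rmask,
; while AVX512 rotates with vprolq $15 and applies one combined vpand. The
; remaining masked tests exercise the same rotate-then-and fold.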
define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $11, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $11, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $11, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $5, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $11, %ymm0, %ymm1
; AVX512-NEXT:    vpsllw $5, %ymm0, %ymm0
; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}