; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512,VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI
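
; Variable shuffle of <2 x i64>: both result elements are selected by runtime indices.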
define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %ret0 = insertelement <2 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <2 x i64> %ret0, i64 %v1, i32 1
  ret <2 x i64> %ret1
}
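
; Variable shuffle of <4 x i32>: SSE3 goes via a stack spill, SSSE3/SSE41 build a pshufb byte mask, AVX uses vpermilps directly.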
define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    andl $3, %eax
; SSE3-NEXT:    andl $3, %ecx
; SSE3-NEXT:    andl $3, %edx
; SSE3-NEXT:    andl $3, %esi
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x i32> %v, i32 %index0
  %v1 = extractelement <4 x i32> %v, i32 %index1
  %v2 = extractelement <4 x i32> %v, i32 %index2
  %v3 = extractelement <4 x i32> %v, i32 %index3
  %ret0 = insertelement <4 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <4 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <4 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <4 x i32> %ret2, i32 %v3, i32 3
  ret <4 x i32> %ret3
}
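
; Variable shuffle of <8 x i16>: AVX512BW+VL lowers this to a single vpermw.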
define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %r8d
; SSE3-NEXT:    pextrw $1, %xmm1, %r9d
; SSE3-NEXT:    pextrw $2, %xmm1, %r10d
; SSE3-NEXT:    pextrw $3, %xmm1, %esi
; SSE3-NEXT:    pextrw $4, %xmm1, %edi
; SSE3-NEXT:    pextrw $5, %xmm1, %eax
; SSE3-NEXT:    pextrw $6, %xmm1, %ecx
; SSE3-NEXT:    pextrw $7, %xmm1, %edx
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    andl $7, %r8d
; SSE3-NEXT:    andl $7, %r9d
; SSE3-NEXT:    andl $7, %r10d
; SSE3-NEXT:    andl $7, %esi
; SSE3-NEXT:    andl $7, %edi
; SSE3-NEXT:    andl $7, %eax
; SSE3-NEXT:    andl $7, %ecx
; SSE3-NEXT:    andl $7, %edx
; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
; SSE3-NEXT:    movd %ecx, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%r10,2), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    paddw {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVXNOVLBW-LABEL: var_shuffle_v8i16:
; AVXNOVLBW:       # %bb.0:
; AVXNOVLBW-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVXNOVLBW-NEXT:    vpaddw {{.*}}(%rip), %xmm1, %xmm1
; AVXNOVLBW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVXNOVLBW-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermw %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <8 x i16> %indices, i32 0
  %index1 = extractelement <8 x i16> %indices, i32 1
  %index2 = extractelement <8 x i16> %indices, i32 2
  %index3 = extractelement <8 x i16> %indices, i32 3
  %index4 = extractelement <8 x i16> %indices, i32 4
  %index5 = extractelement <8 x i16> %indices, i32 5
  %index6 = extractelement <8 x i16> %indices, i32 6
  %index7 = extractelement <8 x i16> %indices, i32 7
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %ret0 = insertelement <8 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <8 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <8 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <8 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <8 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <8 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <8 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <8 x i16> %ret6, i16 %v7, i32 7
  ret <8 x i16> %ret7
}
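
; Variable shuffle of <16 x i8>: SSSE3 and later lower this to a single pshufb.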
define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm8
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %index4 = extractelement <16 x i8> %indices, i32 4
  %index5 = extractelement <16 x i8> %indices, i32 5
  %index6 = extractelement <16 x i8> %indices, i32 6
  %index7 = extractelement <16 x i8> %indices, i32 7
  %index8 = extractelement <16 x i8> %indices, i32 8
  %index9 = extractelement <16 x i8> %indices, i32 9
  %index10 = extractelement <16 x i8> %indices, i32 10
  %index11 = extractelement <16 x i8> %indices, i32 11
  %index12 = extractelement <16 x i8> %indices, i32 12
  %index13 = extractelement <16 x i8> %indices, i32 13
  %index14 = extractelement <16 x i8> %indices, i32 14
  %index15 = extractelement <16 x i8> %indices, i32 15
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}
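
; Variable shuffle of <2 x double>; the index handling matches the <2 x i64> case.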
define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2f64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE3-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v2f64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v2f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
; SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %ret0 = insertelement <2 x double> undef, double %v0, i32 0
  %ret1 = insertelement <2 x double> %ret0, double %v1, i32 1
  ret <2 x double> %ret1
}
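
; Variable shuffle of <4 x float>; the index handling matches the <4 x i32> case.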
define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4f32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    andl $3, %eax
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    andl $3, %ecx
; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    andl $3, %edx
; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    andl $3, %esi
; SSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x float> %v, i32 %index0
  %v1 = extractelement <4 x float> %v, i32 %index1
  %v2 = extractelement <4 x float> %v, i32 %index2
  %v3 = extractelement <4 x float> %v, i32 %index3
  %ret0 = insertelement <4 x float> undef, float %v0, i32 0
  %ret1 = insertelement <4 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <4 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <4 x float> %ret2, float %v3, i32 3
  ret <4 x float> %ret3
}
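
; Same as the <16 x i8> case, but only the low 16 lanes of the <32 x i8> index vector are used.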
define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm8
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}
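
; Variable shuffle selecting from a <32 x i8> source into a <16 x i8> result; AVX512VBMI+VL lowers this to a single vpermb.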
define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    movq %rsp, %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    andq $-32, %rsp
; SSE3-NEXT:    subq $608, %rsp # imm = 0x260
; SSE3-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r9d
; SSE3-NEXT:    andl $31, %r9d
; SSE3-NEXT:    movzbl 64(%rsp,%r9), %ebx
; SSE3-NEXT:    movd %ebx, %xmm8
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 96(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    andl $31, %edx
; SSE3-NEXT:    movzbl 128(%rsp,%rdx), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    andl $31, %ecx
; SSE3-NEXT:    movzbl 160(%rsp,%rcx), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    andl $31, %esi
; SSE3-NEXT:    movzbl 192(%rsp,%rsi), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    andl $31, %edi
; SSE3-NEXT:    movzbl 224(%rsp,%rdi), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    andl $31, %r8d
; SSE3-NEXT:    movzbl 256(%rsp,%r8), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    andl $31, %r10d
; SSE3-NEXT:    movzbl 288(%rsp,%r10), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    andl $31, %r13d
; SSE3-NEXT:    movzbl 320(%rsp,%r13), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    andl $31, %r12d
; SSE3-NEXT:    movzbl 352(%rsp,%r12), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    andl $31, %r15d
; SSE3-NEXT:    movzbl 384(%rsp,%r15), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    andl $31, %r14d
; SSE3-NEXT:    movzbl 416(%rsp,%r14), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    andl $31, %r11d
; SSE3-NEXT:    movzbl 448(%rsp,%r11), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 480(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 512(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 544(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT:    leaq -40(%rbp), %rsp
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pushq %rbp
; SSSE3-NEXT:    movq %rsp, %rbp
; SSSE3-NEXT:    pushq %r15
; SSSE3-NEXT:    pushq %r14
; SSSE3-NEXT:    pushq %r13
; SSSE3-NEXT:    pushq %r12
; SSSE3-NEXT:    pushq %rbx
; SSSE3-NEXT:    andq $-32, %rsp
; SSSE3-NEXT:    subq $608, %rsp # imm = 0x260
; SSSE3-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r9d
; SSSE3-NEXT:    andl $31, %r9d
; SSSE3-NEXT:    movzbl 64(%rsp,%r9), %ebx
; SSSE3-NEXT:    movd %ebx, %xmm8
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 96(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm15
; SSSE3-NEXT:    andl $31, %edx
; SSSE3-NEXT:    movzbl 128(%rsp,%rdx), %eax
; SSSE3-NEXT:    movd %eax, %xmm9
; SSSE3-NEXT:    andl $31, %ecx
; SSSE3-NEXT:    movzbl 160(%rsp,%rcx), %eax
; SSSE3-NEXT:    movd %eax, %xmm3
; SSSE3-NEXT:    andl $31, %esi
; SSSE3-NEXT:    movzbl 192(%rsp,%rsi), %eax
; SSSE3-NEXT:    movd %eax, %xmm10
; SSSE3-NEXT:    andl $31, %edi
; SSSE3-NEXT:    movzbl 224(%rsp,%rdi), %eax
; SSSE3-NEXT:    movd %eax, %xmm7
; SSSE3-NEXT:    andl $31, %r8d
; SSSE3-NEXT:    movzbl 256(%rsp,%r8), %eax
; SSSE3-NEXT:    movd %eax, %xmm11
; SSSE3-NEXT:    andl $31, %r10d
; SSSE3-NEXT:    movzbl 288(%rsp,%r10), %eax
; SSSE3-NEXT:    movd %eax, %xmm6
; SSSE3-NEXT:    andl $31, %r13d
; SSSE3-NEXT:    movzbl 320(%rsp,%r13), %eax
; SSSE3-NEXT:    movd %eax, %xmm12
; SSSE3-NEXT:    andl $31, %r12d
; SSSE3-NEXT:    movzbl 352(%rsp,%r12), %eax
; SSSE3-NEXT:    movd %eax, %xmm5
; SSSE3-NEXT:    andl $31, %r15d
; SSSE3-NEXT:    movzbl 384(%rsp,%r15), %eax
; SSSE3-NEXT:    movd %eax, %xmm13
; SSSE3-NEXT:    andl $31, %r14d
; SSSE3-NEXT:    movzbl 416(%rsp,%r14), %eax
; SSSE3-NEXT:    movd %eax, %xmm4
; SSSE3-NEXT:    andl $31, %r11d
; SSSE3-NEXT:    movzbl 448(%rsp,%r11), %eax
; SSSE3-NEXT:    movd %eax, %xmm14
; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 480(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 512(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 544(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT:    leaq -40(%rbp), %rsp
; SSSE3-NEXT:    popq %rbx
; SSSE3-NEXT:    popq %r12
; SSSE3-NEXT:    popq %r13
; SSSE3-NEXT:    popq %r14
; SSSE3-NEXT:    popq %r15
; SSSE3-NEXT:    popq %rbp
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pushq %rbp
; SSE41-NEXT:    movq %rsp, %rbp
; SSE41-NEXT:    andq $-32, %rsp
; SSE41-NEXT:    subq $544, %rsp # imm = 0x220
; SSE41-NEXT:    pextrb $0, %xmm2, %eax
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, (%rsp)
; SSE41-NEXT:    movzbl 480(%rsp,%rax), %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pextrb $1, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $1, 448(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $2, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $2, 416(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $3, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $3, 384(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $4, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $4, 352(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $5, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $5, 320(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $6, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $6, 288(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $7, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $7, 256(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $8, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $8, 224(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $9, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $9, 192(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $10, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $10, 160(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $11, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $11, 128(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $12, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $12, 96(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $13, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $13, 64(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $14, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $14, 32(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $15, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $15, (%rsp,%rax), %xmm0
; SSE41-NEXT:    movq %rbp, %rsp
; SSE41-NEXT:    popq %rbp
; SSE41-NEXT:    retq
;
; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT:    vpperm %xmm1, %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpgtb {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512VLBW-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512VLBW-NEXT:    vmovdqa %xmm2, %xmm0
; AVX512VLBW-NEXT:    vzeroupper
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VLVBMI-NEXT:    vzeroupper
; VLVBMI-NEXT:    retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %index4 = extractelement <16 x i8> %indices, i32 4
  %index5 = extractelement <16 x i8> %indices, i32 5
  %index6 = extractelement <16 x i8> %indices, i32 6
  %index7 = extractelement <16 x i8> %indices, i32 7
  %index8 = extractelement <16 x i8> %indices, i32 8
  %index9 = extractelement <16 x i8> %indices, i32 9
  %index10 = extractelement <16 x i8> %indices, i32 10
  %index11 = extractelement <16 x i8> %indices, i32 11
  %index12 = extractelement <16 x i8> %indices, i32 12
  %index13 = extractelement <16 x i8> %indices, i32 13
  %index14 = extractelement <16 x i8> %indices, i32 14
  %index15 = extractelement <16 x i8> %indices, i32 15
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}