; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI

define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %ret0 = insertelement <2 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <2 x i64> %ret0, i64 %v1, i32 1
  ret <2 x i64> %ret1
}

define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    andl $3, %eax
; SSE3-NEXT:    andl $3, %ecx
; SSE3-NEXT:    andl $3, %edx
; SSE3-NEXT:    andl $3, %esi
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x i32> %v, i32 %index0
  %v1 = extractelement <4 x i32> %v, i32 %index1
  %v2 = extractelement <4 x i32> %v, i32 %index2
  %v3 = extractelement <4 x i32> %v, i32 %index3
  %ret0 = insertelement <4 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <4 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <4 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <4 x i32> %ret2, i32 %v3, i32 3
  ret <4 x i32> %ret3
}

define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %r8d
; SSE3-NEXT:    pextrw $1, %xmm1, %r9d
; SSE3-NEXT:    pextrw $2, %xmm1, %r10d
; SSE3-NEXT:    pextrw $3, %xmm1, %esi
; SSE3-NEXT:    pextrw $4, %xmm1, %edi
; SSE3-NEXT:    pextrw $5, %xmm1, %eax
; SSE3-NEXT:    pextrw $6, %xmm1, %ecx
; SSE3-NEXT:    pextrw $7, %xmm1, %edx
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    andl $7, %r8d
; SSE3-NEXT:    andl $7, %r9d
; SSE3-NEXT:    andl $7, %r10d
; SSE3-NEXT:    andl $7, %esi
; SSE3-NEXT:    andl $7, %edi
; SSE3-NEXT:    andl $7, %eax
; SSE3-NEXT:    andl $7, %ecx
; SSE3-NEXT:    andl $7, %edx
; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
; SSE3-NEXT:    movd %ecx, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%r10,2), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    paddw {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVXNOVLBW-LABEL: var_shuffle_v8i16:
; AVXNOVLBW:       # %bb.0:
; AVXNOVLBW-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVXNOVLBW-NEXT:    vpaddw {{.*}}(%rip), %xmm1, %xmm1
; AVXNOVLBW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVXNOVLBW-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermw %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <8 x i16> %indices, i32 0
  %index1 = extractelement <8 x i16> %indices, i32 1
  %index2 = extractelement <8 x i16> %indices, i32 2
  %index3 = extractelement <8 x i16> %indices, i32 3
  %index4 = extractelement <8 x i16> %indices, i32 4
  %index5 = extractelement <8 x i16> %indices, i32 5
  %index6 = extractelement <8 x i16> %indices, i32 6
  %index7 = extractelement <8 x i16> %indices, i32 7
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %ret0 = insertelement <8 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <8 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <8 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <8 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <8 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <8 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <8 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <8 x i16> %ret6, i16 %v7, i32 7
  ret <8 x i16> %ret7
}

define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm8
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %index4 = extractelement <16 x i8> %indices, i32 4
  %index5 = extractelement <16 x i8> %indices, i32 5
  %index6 = extractelement <16 x i8> %indices, i32 6
  %index7 = extractelement <16 x i8> %indices, i32 7
  %index8 = extractelement <16 x i8> %indices, i32 8
  %index9 = extractelement <16 x i8> %indices, i32 9
  %index10 = extractelement <16 x i8> %indices, i32 10
  %index11 = extractelement <16 x i8> %indices, i32 11
  %index12 = extractelement <16 x i8> %indices, i32 12
  %index13 = extractelement <16 x i8> %indices, i32 13
  %index14 = extractelement <16 x i8> %indices, i32 14
  %index15 = extractelement <16 x i8> %indices, i32 15
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}

define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2f64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v2f64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v2f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
; SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %ret0 = insertelement <2 x double> undef, double %v0, i32 0
  %ret1 = insertelement <2 x double> %ret0, double %v1, i32 1
  ret <2 x double> %ret1
}

define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4f32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    andl $3, %eax
; SSE3-NEXT:    andl $3, %ecx
; SSE3-NEXT:    andl $3, %edx
; SSE3-NEXT:    andl $3, %esi
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x float> %v, i32 %index0
  %v1 = extractelement <4 x float> %v, i32 %index1
  %v2 = extractelement <4 x float> %v, i32 %index2
  %v3 = extractelement <4 x float> %v, i32 %index3
  %ret0 = insertelement <4 x float> undef, float %v0, i32 0
  %ret1 = insertelement <4 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <4 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <4 x float> %ret2, float %v3, i32 3
  ret <4 x float> %ret3
}

define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm8
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}

define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    subq $424, %rsp # imm = 0x1A8
; SSE3-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r14d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r15d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r12d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r13d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, (%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
; SSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
; SSE3-NEXT:    andl $31, %r8d
; SSE3-NEXT:    movzbl -96(%rsp,%r8), %esi
; SSE3-NEXT:    movd %esi, %xmm8
; SSE3-NEXT:    andl $31, %ebp
; SSE3-NEXT:    movzbl -64(%rsp,%rbp), %esi
; SSE3-NEXT:    movd %esi, %xmm15
; SSE3-NEXT:    andl $31, %edx
; SSE3-NEXT:    movzbl -32(%rsp,%rdx), %edx
; SSE3-NEXT:    movd %edx, %xmm9
; SSE3-NEXT:    andl $31, %ecx
; SSE3-NEXT:    movzbl (%rsp,%rcx), %ecx
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 32(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    andl $31, %edi
; SSE3-NEXT:    movzbl 64(%rsp,%rdi), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    andl $31, %ebx
; SSE3-NEXT:    movzbl 96(%rsp,%rbx), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    andl $31, %r9d
; SSE3-NEXT:    movzbl 128(%rsp,%r9), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    andl $31, %r13d
; SSE3-NEXT:    movzbl 160(%rsp,%r13), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    andl $31, %r12d
; SSE3-NEXT:    movzbl 192(%rsp,%r12), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    andl $31, %r15d
; SSE3-NEXT:    movzbl 224(%rsp,%r15), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    andl $31, %r14d
; SSE3-NEXT:    movzbl 256(%rsp,%r14), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    andl $31, %r11d
; SSE3-NEXT:    movzbl 288(%rsp,%r11), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    andl $31, %r10d
; SSE3-NEXT:    movzbl 320(%rsp,%r10), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 352(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE3-NEXT:    andl $31, %eax
; SSE3-NEXT:    movzbl 384(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSE3-NEXT:    addq $424, %rsp # imm = 0x1A8
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pushq %rbp
; SSSE3-NEXT:    pushq %r15
; SSSE3-NEXT:    pushq %r14
; SSSE3-NEXT:    pushq %r13
; SSSE3-NEXT:    pushq %r12
; SSSE3-NEXT:    pushq %rbx
; SSSE3-NEXT:    subq $424, %rsp # imm = 0x1A8
; SSSE3-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r10d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r11d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r14d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r15d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r12d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r13d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebx
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, (%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ebp
; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
; SSSE3-NEXT:    andl $31, %r8d
; SSSE3-NEXT:    movzbl -96(%rsp,%r8), %esi
; SSSE3-NEXT:    movd %esi, %xmm8
; SSSE3-NEXT:    andl $31, %ebp
; SSSE3-NEXT:    movzbl -64(%rsp,%rbp), %esi
; SSSE3-NEXT:    movd %esi, %xmm15
; SSSE3-NEXT:    andl $31, %edx
; SSSE3-NEXT:    movzbl -32(%rsp,%rdx), %edx
; SSSE3-NEXT:    movd %edx, %xmm9
; SSSE3-NEXT:    andl $31, %ecx
; SSSE3-NEXT:    movzbl (%rsp,%rcx), %ecx
; SSSE3-NEXT:    movd %ecx, %xmm3
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 32(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm10
; SSSE3-NEXT:    andl $31, %edi
; SSSE3-NEXT:    movzbl 64(%rsp,%rdi), %eax
; SSSE3-NEXT:    movd %eax, %xmm7
; SSSE3-NEXT:    andl $31, %ebx
; SSSE3-NEXT:    movzbl 96(%rsp,%rbx), %eax
; SSSE3-NEXT:    movd %eax, %xmm11
; SSSE3-NEXT:    andl $31, %r9d
; SSSE3-NEXT:    movzbl 128(%rsp,%r9), %eax
; SSSE3-NEXT:    movd %eax, %xmm6
; SSSE3-NEXT:    andl $31, %r13d
; SSSE3-NEXT:    movzbl 160(%rsp,%r13), %eax
; SSSE3-NEXT:    movd %eax, %xmm12
; SSSE3-NEXT:    andl $31, %r12d
; SSSE3-NEXT:    movzbl 192(%rsp,%r12), %eax
; SSSE3-NEXT:    movd %eax, %xmm5
; SSSE3-NEXT:    andl $31, %r15d
; SSSE3-NEXT:    movzbl 224(%rsp,%r15), %eax
; SSSE3-NEXT:    movd %eax, %xmm13
; SSSE3-NEXT:    andl $31, %r14d
; SSSE3-NEXT:    movzbl 256(%rsp,%r14), %eax
; SSSE3-NEXT:    movd %eax, %xmm4
; SSSE3-NEXT:    andl $31, %r11d
; SSSE3-NEXT:    movzbl 288(%rsp,%r11), %eax
; SSSE3-NEXT:    movd %eax, %xmm14
; SSSE3-NEXT:    andl $31, %r10d
; SSSE3-NEXT:    movzbl 320(%rsp,%r10), %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 352(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT:    andl $31, %eax
; SSSE3-NEXT:    movzbl 384(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT:    addq $424, %rsp # imm = 0x1A8
; SSSE3-NEXT:    popq %rbx
; SSSE3-NEXT:    popq %r12
; SSSE3-NEXT:    popq %r13
; SSSE3-NEXT:    popq %r14
; SSSE3-NEXT:    popq %r15
; SSSE3-NEXT:    popq %rbp
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    subq $392, %rsp # imm = 0x188
; SSE41-NEXT:    movd %xmm2, %eax
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, (%rsp)
; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movzbl 352(%rsp,%rax), %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pextrb $1, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $1, 320(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $2, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $2, 288(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $3, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $3, 256(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $4, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $4, 224(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $5, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $5, 192(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $6, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $6, 160(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $7, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $7, 128(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $8, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $8, 96(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $9, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $9, 64(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $10, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $10, 32(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $11, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $11, (%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $12, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $12, -32(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $13, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $13, -64(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $14, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $14, -96(%rsp,%rax), %xmm0
; SSE41-NEXT:    pextrb $15, %xmm2, %eax
; SSE41-NEXT:    andl $31, %eax
; SSE41-NEXT:    pinsrb $15, -128(%rsp,%rax), %xmm0
; SSE41-NEXT:    addq $392, %rsp # imm = 0x188
; SSE41-NEXT:    retq
;
; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT:    vpperm %xmm1, %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpgtb {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX512VLBW-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT:    vmovdqu8 %ymm2, %ymm0 {%k1}
; AVX512VLBW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VLBW-NEXT:    vzeroupper
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; VLVBMI-NEXT:    vzeroupper
; VLVBMI-NEXT:    retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %index4 = extractelement <16 x i8> %indices, i32 4
  %index5 = extractelement <16 x i8> %indices, i32 5
  %index6 = extractelement <16 x i8> %indices, i32 6
  %index7 = extractelement <16 x i8> %indices, i32 7
  %index8 = extractelement <16 x i8> %indices, i32 8
  %index9 = extractelement <16 x i8> %indices, i32 9
  %index10 = extractelement <16 x i8> %indices, i32 10
  %index11 = extractelement <16 x i8> %indices, i32 11
  %index12 = extractelement <16 x i8> %indices, i32 12
  %index13 = extractelement <16 x i8> %indices, i32 13
  %index14 = extractelement <16 x i8> %indices, i32 14
  %index15 = extractelement <16 x i8> %indices, i32 15
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}
