; (removed: web code-viewer navigation chrome — "Home / Line# / Scopes# / Navigate / Raw / Download" — which is not part of the test file)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

;
; Unary shuffle indices from registers
;

define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
; AVX:       # BB#0:
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
; IR under test: extract both f64 lanes of %x at runtime-variable i64 indices,
; then rebuild a <2 x double>; expected lowering spills %x and reloads per-lane.
  %x0 = extractelement <2 x double> %x, i64 %i0
  %x1 = extractelement <2 x double> %x, i64 %i1
  %r0 = insertelement <2 x double> undef, double %x0, i32 0
  %r1 = insertelement <2 x double>   %r0, double %x1, i32 1
  ret <2 x double> %r1
}

; NOTE(review): the function name says _i64 but the index parameters are i32
; (the checks sign-extend them with movslq) — the name looks stale; confirm
; against the sibling tests before renaming, since FileCheck LABELs match it.
define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
; SSE:       # BB#0:
; SSE-NEXT:    movslq %edi, %rax
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movslq %esi, %rcx
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
; AVX:       # BB#0:
; AVX-NEXT:    movslq %edi, %rax
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    movslq %esi, %rcx
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
; IR under test: extract both i64 lanes of %x at variable i32 indices, then
; rebuild a <2 x i64>; expected lowering spills %x and unpacks the two reloads.
  %x0 = extractelement <2 x i64> %x, i32 %i0
  %x1 = extractelement <2 x i64> %x, i32 %i1
  %r0 = insertelement <2 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <2 x i64>   %r0, i64 %x1, i32 1
  ret <2 x i64> %r1
}

define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movslq %edi, %rax
; SSE2-NEXT:    movslq %esi, %rsi
; SSE2-NEXT:    movslq %edx, %rdx
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movslq %ecx, %rcx
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movslq %edi, %rax
; SSSE3-NEXT:    movslq %esi, %rsi
; SSSE3-NEXT:    movslq %edx, %rdx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movslq %ecx, %rcx
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movslq %edi, %rax
; SSE41-NEXT:    movslq %esi, %rsi
; SSE41-NEXT:    movslq %edx, %rdx
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movslq %ecx, %rcx
; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; AVX:       # BB#0:
; AVX-NEXT:    movslq %edi, %rax
; AVX-NEXT:    movslq %esi, %rsi
; AVX-NEXT:    movslq %edx, %rdx
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    movslq %ecx, %rcx
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; AVX-NEXT:    retq
; IR under test: gather all four f32 lanes of %x at variable i32 indices and
; rebuild the vector; SSE2/SSSE3 use unpcklps trees, SSE4.1/AVX use insertps.
  %x0 = extractelement <4 x float> %x, i32 %i0
  %x1 = extractelement <4 x float> %x, i32 %i1
  %x2 = extractelement <4 x float> %x, i32 %i2
  %x3 = extractelement <4 x float> %x, i32 %i3
  %r0 = insertelement <4 x float> undef, float %x0, i32 0
  %r1 = insertelement <4 x float>   %r0, float %x1, i32 1
  %r2 = insertelement <4 x float>   %r1, float %x2, i32 2
  %r3 = insertelement <4 x float>   %r2, float %x3, i32 3
  ret <4 x float> %r3
}

define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movslq %edi, %rax
; SSE2-NEXT:    movslq %esi, %rsi
; SSE2-NEXT:    movslq %edx, %rdx
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movslq %ecx, %rcx
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movslq %edi, %rax
; SSSE3-NEXT:    movslq %esi, %rsi
; SSSE3-NEXT:    movslq %edx, %rdx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movslq %ecx, %rcx
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movslq %edi, %rax
; SSE41-NEXT:    movslq %esi, %rsi
; SSE41-NEXT:    movslq %edx, %rdx
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movslq %ecx, %rcx
; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, -24(%rsp,%rsi,4), %xmm0
; SSE41-NEXT:    pinsrd $2, -24(%rsp,%rdx,4), %xmm0
; SSE41-NEXT:    pinsrd $3, -24(%rsp,%rcx,4), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; AVX:       # BB#0:
; AVX-NEXT:    movslq %edi, %rax
; AVX-NEXT:    movslq %esi, %rsi
; AVX-NEXT:    movslq %edx, %rdx
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    movslq %ecx, %rcx
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
; AVX-NEXT:    retq
; IR under test: gather all four i32 lanes of %x at variable i32 indices and
; rebuild the vector; SSE2/SSSE3 use punpckldq trees, SSE4.1/AVX use pinsrd.
  %x0 = extractelement <4 x i32> %x, i32 %i0
  %x1 = extractelement <4 x i32> %x, i32 %i1
  %x2 = extractelement <4 x i32> %x, i32 %i2
  %x3 = extractelement <4 x i32> %x, i32 %i3
  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
  %r1 = insertelement <4 x i32>   %r0, i32 %x1, i32 1
  %r2 = insertelement <4 x i32>   %r1, i32 %x2, i32 2
  %r3 = insertelement <4 x i32>   %r2, i32 %x3, i32 3
  ret <4 x i32> %r3
}

define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
; SSE2-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; SSE2:       # BB#0:
; SSE2-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
; SSE2-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
; SSE2-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
; SSE2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE2-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE2-NEXT:    movswq %di, %rax
; SSE2-NEXT:    movswq %si, %rsi
; SSE2-NEXT:    movswq %dx, %rdx
; SSE2-NEXT:    movswq %cx, %r10
; SSE2-NEXT:    movswq %r8w, %r11
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movswq %r9w, %r8
; SSE2-NEXT:    movswq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movswq {{[0-9]+}}(%rsp), %rdi
; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
; SSE2-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
; SSE2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
; SSE2-NEXT:    movd %ecx, %xmm0
; SSE2-NEXT:    movzwl -24(%rsp,%rdx,2), %ecx
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movzwl -24(%rsp,%r10,2), %ecx
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movzwl -24(%rsp,%r11,2), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    movd %edi, %xmm1
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    movd %esi, %xmm1
; SSE2-NEXT:    movzwl -24(%rsp,%r8,2), %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
; SSSE3-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
; SSSE3-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
; SSSE3-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSSE3-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSSE3-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSSE3-NEXT:    movswq %di, %rax
; SSSE3-NEXT:    movswq %si, %rsi
; SSSE3-NEXT:    movswq %dx, %rdx
; SSSE3-NEXT:    movswq %cx, %r10
; SSSE3-NEXT:    movswq %r8w, %r11
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movswq %r9w, %r8
; SSSE3-NEXT:    movswq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movswq {{[0-9]+}}(%rsp), %rdi
; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
; SSSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
; SSSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
; SSSE3-NEXT:    movd %ecx, %xmm0
; SSSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %ecx
; SSSE3-NEXT:    movd %ecx, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movzwl -24(%rsp,%r10,2), %ecx
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    movzwl -24(%rsp,%r11,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    movd %edi, %xmm1
; SSSE3-NEXT:    movd %ecx, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT:    movd %esi, %xmm1
; SSSE3-NEXT:    movzwl -24(%rsp,%r8,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm3
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; SSE41:       # BB#0:
; SSE41-NEXT:    pushq %rbx
; SSE41-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
; SSE41-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
; SSE41-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
; SSE41-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
; SSE41-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
; SSE41-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; SSE41-NEXT:    movswq %di, %rax
; SSE41-NEXT:    movswq %si, %rbx
; SSE41-NEXT:    movswq %dx, %r11
; SSE41-NEXT:    movswq %cx, %r10
; SSE41-NEXT:    movswq %r8w, %rdi
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movswq %r9w, %rcx
; SSE41-NEXT:    movswq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT:    movswq {{[0-9]+}}(%rsp), %rsi
; SSE41-NEXT:    movzwl -16(%rsp,%rdx,2), %edx
; SSE41-NEXT:    movzwl -16(%rsp,%rsi,2), %esi
; SSE41-NEXT:    movzwl -16(%rsp,%rax,2), %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pinsrw $1, -16(%rsp,%rbx,2), %xmm0
; SSE41-NEXT:    pinsrw $2, -16(%rsp,%r11,2), %xmm0
; SSE41-NEXT:    pinsrw $3, -16(%rsp,%r10,2), %xmm0
; SSE41-NEXT:    pinsrw $4, -16(%rsp,%rdi,2), %xmm0
; SSE41-NEXT:    pinsrw $5, -16(%rsp,%rcx,2), %xmm0
; SSE41-NEXT:    pinsrw $6, %edx, %xmm0
; SSE41-NEXT:    pinsrw $7, %esi, %xmm0
; SSE41-NEXT:    popq %rbx
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; AVX:       # BB#0:
; AVX-NEXT:    pushq %r14
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
; AVX-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
; AVX-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
; AVX-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
; AVX-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
; AVX-NEXT:    movswq %di, %r10
; AVX-NEXT:    movswq %si, %r11
; AVX-NEXT:    movswq %dx, %r14
; AVX-NEXT:    movswq %cx, %rcx
; AVX-NEXT:    movswq %r8w, %rdi
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    movswq %r9w, %rax
; AVX-NEXT:    movswq {{[0-9]+}}(%rsp), %rsi
; AVX-NEXT:    movswq {{[0-9]+}}(%rsp), %rdx
; AVX-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
; AVX-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
; AVX-NEXT:    movzwl -24(%rsp,%r10,2), %ebx
; AVX-NEXT:    vmovd %ebx, %xmm0
; AVX-NEXT:    vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $2, -24(%rsp,%r14,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $4, -24(%rsp,%rdi,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $7, %edx, %xmm0, %xmm0
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    popq %r14
; AVX-NEXT:    retq
; IR under test: gather all eight i16 lanes of %x at variable i16 indices
; (indices 6/7 arrive on the stack per the SysV ABI, hence the movswq loads
; from (%rsp) in the checks) and rebuild the vector; SSE2/SSSE3 build a
; punpcklwd tree, SSE4.1/AVX use pinsrw.
  %x0 = extractelement <8 x i16> %x, i16 %i0
  %x1 = extractelement <8 x i16> %x, i16 %i1
  %x2 = extractelement <8 x i16> %x, i16 %i2
  %x3 = extractelement <8 x i16> %x, i16 %i3
  %x4 = extractelement <8 x i16> %x, i16 %i4
  %x5 = extractelement <8 x i16> %x, i16 %i5
  %x6 = extractelement <8 x i16> %x, i16 %i6
  %x7 = extractelement <8 x i16> %x, i16 %i7
  %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
  %r1 = insertelement <8 x i16>   %r0, i16 %x1, i32 1
  %r2 = insertelement <8 x i16>   %r1, i16 %x2, i32 2
  %r3 = insertelement <8 x i16>   %r2, i16 %x3, i32 3
  %r4 = insertelement <8 x i16>   %r3, i16 %x4, i32 4
  %r5 = insertelement <8 x i16>   %r4, i16 %x5, i32 5
  %r6 = insertelement <8 x i16>   %r5, i16 %x6, i32 6
  %r7 = insertelement <8 x i16>   %r6, i16 %x7, i32 7
  ret <8 x i16> %r7
}

367define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %i0, i8 %i1, i8 %i2, i8 %i3, i8 %i4, i8 %i5, i8 %i6, i8 %i7, i8 %i8, i8 %i9, i8 %i10, i8 %i11, i8 %i12, i8 %i13, i8 %i14, i8 %i15) nounwind {
368; SSE2-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
369; SSE2:       # BB#0:
370; SSE2-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
371; SSE2-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
372; SSE2-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
373; SSE2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
374; SSE2-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
375; SSE2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
376; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
377; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %r10
378; SSE2-NEXT:    leaq -{{[0-9]+}}(%rsp), %r11
379; SSE2-NEXT:    movzbl (%r10,%r11), %eax
380; SSE2-NEXT:    movd %eax, %xmm15
381; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
382; SSE2-NEXT:    movzbl (%rax,%r11), %eax
383; SSE2-NEXT:    movd %eax, %xmm8
384; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
385; SSE2-NEXT:    movzbl (%rax,%r11), %eax
386; SSE2-NEXT:    movd %eax, %xmm9
387; SSE2-NEXT:    movsbq %dl, %rax
388; SSE2-NEXT:    movzbl (%rax,%r11), %eax
389; SSE2-NEXT:    movd %eax, %xmm3
390; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
391; SSE2-NEXT:    movzbl (%rax,%r11), %eax
392; SSE2-NEXT:    movd %eax, %xmm10
393; SSE2-NEXT:    movsbq %dil, %rax
394; SSE2-NEXT:    movzbl (%rax,%r11), %eax
395; SSE2-NEXT:    movd %eax, %xmm0
396; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
397; SSE2-NEXT:    movzbl (%rax,%r11), %eax
398; SSE2-NEXT:    movd %eax, %xmm11
399; SSE2-NEXT:    movsbq %r8b, %rax
400; SSE2-NEXT:    movzbl (%rax,%r11), %eax
401; SSE2-NEXT:    movd %eax, %xmm7
402; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
403; SSE2-NEXT:    movzbl (%rax,%r11), %eax
404; SSE2-NEXT:    movd %eax, %xmm2
405; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
406; SSE2-NEXT:    movzbl (%rax,%r11), %eax
407; SSE2-NEXT:    movd %eax, %xmm12
408; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
409; SSE2-NEXT:    movzbl (%rax,%r11), %eax
410; SSE2-NEXT:    movd %eax, %xmm13
411; SSE2-NEXT:    movsbq %cl, %rax
412; SSE2-NEXT:    movzbl (%rax,%r11), %eax
413; SSE2-NEXT:    movd %eax, %xmm6
414; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
415; SSE2-NEXT:    movzbl (%rax,%r11), %eax
416; SSE2-NEXT:    movd %eax, %xmm14
417; SSE2-NEXT:    movsbq %sil, %rax
418; SSE2-NEXT:    movzbl (%rax,%r11), %eax
419; SSE2-NEXT:    movd %eax, %xmm5
420; SSE2-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
421; SSE2-NEXT:    movzbl (%rax,%r11), %eax
422; SSE2-NEXT:    movd %eax, %xmm4
423; SSE2-NEXT:    movsbq %r9b, %rax
424; SSE2-NEXT:    movzbl (%rax,%r11), %eax
425; SSE2-NEXT:    movd %eax, %xmm1
426; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
427; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
428; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
429; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
430; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
431; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
432; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
433; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
434; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
435; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
436; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
437; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
438; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
439; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
440; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
441; SSE2-NEXT:    retq
442;
443; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
444; SSSE3:       # BB#0:
445; SSSE3-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
446; SSSE3-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
447; SSSE3-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
448; SSSE3-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
449; SSSE3-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
450; SSSE3-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
451; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
452; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %r10
453; SSSE3-NEXT:    leaq -{{[0-9]+}}(%rsp), %r11
454; SSSE3-NEXT:    movzbl (%r10,%r11), %eax
455; SSSE3-NEXT:    movd %eax, %xmm15
456; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
457; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
458; SSSE3-NEXT:    movd %eax, %xmm8
459; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
460; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
461; SSSE3-NEXT:    movd %eax, %xmm9
462; SSSE3-NEXT:    movsbq %dl, %rax
463; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
464; SSSE3-NEXT:    movd %eax, %xmm3
465; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
466; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
467; SSSE3-NEXT:    movd %eax, %xmm10
468; SSSE3-NEXT:    movsbq %dil, %rax
469; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
470; SSSE3-NEXT:    movd %eax, %xmm0
471; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
472; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
473; SSSE3-NEXT:    movd %eax, %xmm11
474; SSSE3-NEXT:    movsbq %r8b, %rax
475; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
476; SSSE3-NEXT:    movd %eax, %xmm7
477; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
478; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
479; SSSE3-NEXT:    movd %eax, %xmm2
480; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
481; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
482; SSSE3-NEXT:    movd %eax, %xmm12
483; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
484; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
485; SSSE3-NEXT:    movd %eax, %xmm13
486; SSSE3-NEXT:    movsbq %cl, %rax
487; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
488; SSSE3-NEXT:    movd %eax, %xmm6
489; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
490; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
491; SSSE3-NEXT:    movd %eax, %xmm14
492; SSSE3-NEXT:    movsbq %sil, %rax
493; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
494; SSSE3-NEXT:    movd %eax, %xmm5
495; SSSE3-NEXT:    movsbq {{[0-9]+}}(%rsp), %rax
496; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
497; SSSE3-NEXT:    movd %eax, %xmm4
498; SSSE3-NEXT:    movsbq %r9b, %rax
499; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
500; SSSE3-NEXT:    movd %eax, %xmm1
501; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
502; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
503; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
504; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
505; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
506; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
507; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
508; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
509; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
510; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
511; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
512; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
513; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
514; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
515; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
516; SSSE3-NEXT:    retq
517;
518; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
519; SSE41:       # BB#0:
520; SSE41-NEXT:    pushq %rbp
521; SSE41-NEXT:    pushq %r15
522; SSE41-NEXT:    pushq %r14
523; SSE41-NEXT:    pushq %r13
524; SSE41-NEXT:    pushq %r12
525; SSE41-NEXT:    pushq %rbx
526; SSE41-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
527; SSE41-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
528; SSE41-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
529; SSE41-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
530; SSE41-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
531; SSE41-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
532; SSE41-NEXT:    movsbq %dil, %r15
533; SSE41-NEXT:    movsbq %sil, %r14
534; SSE41-NEXT:    movsbq %dl, %r11
535; SSE41-NEXT:    movsbq %cl, %r10
536; SSE41-NEXT:    movsbq %r8b, %r8
537; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
538; SSE41-NEXT:    movsbq %r9b, %r9
539; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %r12
540; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %r13
541; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %rbp
542; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %rbx
543; SSE41-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
544; SSE41-NEXT:    movzbl (%r15,%rax), %ecx
545; SSE41-NEXT:    movd %ecx, %xmm0
546; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %r15
547; SSE41-NEXT:    pinsrb $1, (%r14,%rax), %xmm0
548; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %r14
549; SSE41-NEXT:    pinsrb $2, (%r11,%rax), %xmm0
550; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %r11
551; SSE41-NEXT:    pinsrb $3, (%r10,%rax), %xmm0
552; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %r10
553; SSE41-NEXT:    pinsrb $4, (%r8,%rax), %xmm0
554; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %rcx
555; SSE41-NEXT:    pinsrb $5, (%r9,%rax), %xmm0
556; SSE41-NEXT:    movsbq {{[0-9]+}}(%rsp), %rdx
557; SSE41-NEXT:    movzbl (%r12,%rax), %esi
558; SSE41-NEXT:    movzbl (%r13,%rax), %edi
559; SSE41-NEXT:    movzbl (%rbp,%rax), %ebp
560; SSE41-NEXT:    movzbl (%rbx,%rax), %ebx
561; SSE41-NEXT:    movzbl (%r15,%rax), %r8d
562; SSE41-NEXT:    movzbl (%r14,%rax), %r9d
563; SSE41-NEXT:    movzbl (%r11,%rax), %r11d
564; SSE41-NEXT:    movzbl (%r10,%rax), %r10d
565; SSE41-NEXT:    movzbl (%rcx,%rax), %ecx
566; SSE41-NEXT:    movzbl (%rdx,%rax), %eax
567; SSE41-NEXT:    pinsrb $6, %esi, %xmm0
568; SSE41-NEXT:    pinsrb $7, %edi, %xmm0
569; SSE41-NEXT:    pinsrb $8, %ebp, %xmm0
570; SSE41-NEXT:    pinsrb $9, %ebx, %xmm0
571; SSE41-NEXT:    pinsrb $10, %r8d, %xmm0
572; SSE41-NEXT:    pinsrb $11, %r9d, %xmm0
573; SSE41-NEXT:    pinsrb $12, %r11d, %xmm0
574; SSE41-NEXT:    pinsrb $13, %r10d, %xmm0
575; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
576; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
577; SSE41-NEXT:    popq %rbx
578; SSE41-NEXT:    popq %r12
579; SSE41-NEXT:    popq %r13
580; SSE41-NEXT:    popq %r14
581; SSE41-NEXT:    popq %r15
582; SSE41-NEXT:    popq %rbp
583; SSE41-NEXT:    retq
584;
585; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
586; AVX:       # BB#0:
587; AVX-NEXT:    pushq %rbp
588; AVX-NEXT:    pushq %r15
589; AVX-NEXT:    pushq %r14
590; AVX-NEXT:    pushq %r13
591; AVX-NEXT:    pushq %r12
592; AVX-NEXT:    pushq %rbx
593; AVX-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
594; AVX-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
595; AVX-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
596; AVX-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
597; AVX-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
598; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
599; AVX-NEXT:    movsbq %dil, %r10
600; AVX-NEXT:    movsbq %sil, %r11
601; AVX-NEXT:    movsbq %dl, %r14
602; AVX-NEXT:    movsbq %cl, %r15
603; AVX-NEXT:    movsbq %r8b, %r8
604; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
605; AVX-NEXT:    movsbq %r9b, %r9
606; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %r12
607; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %r13
608; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %rbp
609; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %rcx
610; AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rdi
611; AVX-NEXT:    movzbl (%r10,%rdi), %eax
612; AVX-NEXT:    vmovd %eax, %xmm0
613; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %r10
614; AVX-NEXT:    vpinsrb $1, (%r11,%rdi), %xmm0, %xmm0
615; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %r11
616; AVX-NEXT:    vpinsrb $2, (%r14,%rdi), %xmm0, %xmm0
617; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %r14
618; AVX-NEXT:    vpinsrb $3, (%r15,%rdi), %xmm0, %xmm0
619; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %r15
620; AVX-NEXT:    vpinsrb $4, (%r8,%rdi), %xmm0, %xmm0
621; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %r8
622; AVX-NEXT:    vpinsrb $5, (%r9,%rdi), %xmm0, %xmm0
623; AVX-NEXT:    movsbq {{[0-9]+}}(%rsp), %rsi
624; AVX-NEXT:    movzbl (%r12,%rdi), %edx
625; AVX-NEXT:    movzbl (%r13,%rdi), %ebx
626; AVX-NEXT:    movzbl (%rbp,%rdi), %ebp
627; AVX-NEXT:    movzbl (%rcx,%rdi), %ecx
628; AVX-NEXT:    movzbl (%r10,%rdi), %eax
629; AVX-NEXT:    movzbl (%r11,%rdi), %r9d
630; AVX-NEXT:    movzbl (%r14,%rdi), %r10d
631; AVX-NEXT:    movzbl (%r15,%rdi), %r11d
632; AVX-NEXT:    movzbl (%r8,%rdi), %r8d
633; AVX-NEXT:    movzbl (%rsi,%rdi), %esi
634; AVX-NEXT:    vpinsrb $6, %edx, %xmm0, %xmm0
635; AVX-NEXT:    vpinsrb $7, %ebx, %xmm0, %xmm0
636; AVX-NEXT:    vpinsrb $8, %ebp, %xmm0, %xmm0
637; AVX-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
638; AVX-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
639; AVX-NEXT:    vpinsrb $11, %r9d, %xmm0, %xmm0
640; AVX-NEXT:    vpinsrb $12, %r10d, %xmm0, %xmm0
641; AVX-NEXT:    vpinsrb $13, %r11d, %xmm0, %xmm0
642; AVX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
643; AVX-NEXT:    vpinsrb $15, %esi, %xmm0, %xmm0
644; AVX-NEXT:    popq %rbx
645; AVX-NEXT:    popq %r12
646; AVX-NEXT:    popq %r13
647; AVX-NEXT:    popq %r14
648; AVX-NEXT:    popq %r15
649; AVX-NEXT:    popq %rbp
650; AVX-NEXT:    retq
651  %x0  = extractelement <16 x i8> %x, i8 %i0
652  %x1  = extractelement <16 x i8> %x, i8 %i1
653  %x2  = extractelement <16 x i8> %x, i8 %i2
654  %x3  = extractelement <16 x i8> %x, i8 %i3
655  %x4  = extractelement <16 x i8> %x, i8 %i4
656  %x5  = extractelement <16 x i8> %x, i8 %i5
657  %x6  = extractelement <16 x i8> %x, i8 %i6
658  %x7  = extractelement <16 x i8> %x, i8 %i7
659  %x8  = extractelement <16 x i8> %x, i8 %i8
660  %x9  = extractelement <16 x i8> %x, i8 %i9
661  %x10 = extractelement <16 x i8> %x, i8 %i10
662  %x11 = extractelement <16 x i8> %x, i8 %i11
663  %x12 = extractelement <16 x i8> %x, i8 %i12
664  %x13 = extractelement <16 x i8> %x, i8 %i13
665  %x14 = extractelement <16 x i8> %x, i8 %i14
666  %x15 = extractelement <16 x i8> %x, i8 %i15
667  %r0  = insertelement <16 x i8> undef, i8 %x0 , i32 0
668  %r1  = insertelement <16 x i8>  %r0 , i8 %x1 , i32 1
669  %r2  = insertelement <16 x i8>  %r1 , i8 %x2 , i32 2
670  %r3  = insertelement <16 x i8>  %r2 , i8 %x3 , i32 3
671  %r4  = insertelement <16 x i8>  %r3 , i8 %x4 , i32 4
672  %r5  = insertelement <16 x i8>  %r4 , i8 %x5 , i32 5
673  %r6  = insertelement <16 x i8>  %r5 , i8 %x6 , i32 6
674  %r7  = insertelement <16 x i8>  %r6 , i8 %x7 , i32 7
675  %r8  = insertelement <16 x i8>  %r7 , i8 %x8 , i32 8
676  %r9  = insertelement <16 x i8>  %r8 , i8 %x9 , i32 9
677  %r10 = insertelement <16 x i8>  %r9 , i8 %x10, i32 10
678  %r11 = insertelement <16 x i8>  %r10, i8 %x11, i32 11
679  %r12 = insertelement <16 x i8>  %r11, i8 %x12, i32 12
680  %r13 = insertelement <16 x i8>  %r12, i8 %x13, i32 13
681  %r14 = insertelement <16 x i8>  %r13, i8 %x14, i32 14
682  %r15 = insertelement <16 x i8>  %r14, i8 %x15, i32 15
683  ret <16 x i8> %r15
684}
685
686;
687; Unary shuffle indices from memory
688;
689
690define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwind {
691; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
692; SSE2:       # BB#0:
693; SSE2-NEXT:    movslq (%rdi), %rax
694; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
695; SSE2-NEXT:    movslq 4(%rdi), %rcx
696; SSE2-NEXT:    movslq 8(%rdi), %rdx
697; SSE2-NEXT:    movslq 12(%rdi), %rsi
698; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
699; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
700; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
701; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
702; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
703; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
704; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
705; SSE2-NEXT:    retq
706;
707; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
708; SSSE3:       # BB#0:
709; SSSE3-NEXT:    movslq (%rdi), %rax
710; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
711; SSSE3-NEXT:    movslq 4(%rdi), %rcx
712; SSSE3-NEXT:    movslq 8(%rdi), %rdx
713; SSSE3-NEXT:    movslq 12(%rdi), %rsi
714; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
715; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
716; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
717; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
718; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
719; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
720; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
721; SSSE3-NEXT:    retq
722;
723; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
724; SSE41:       # BB#0:
725; SSE41-NEXT:    movslq (%rdi), %rax
726; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
727; SSE41-NEXT:    movslq 4(%rdi), %rcx
728; SSE41-NEXT:    movslq 8(%rdi), %rdx
729; SSE41-NEXT:    movslq 12(%rdi), %rsi
730; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
731; SSE41-NEXT:    pinsrd $1, -24(%rsp,%rcx,4), %xmm0
732; SSE41-NEXT:    pinsrd $2, -24(%rsp,%rdx,4), %xmm0
733; SSE41-NEXT:    pinsrd $3, -24(%rsp,%rsi,4), %xmm0
734; SSE41-NEXT:    retq
735;
736; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
737; AVX:       # BB#0:
738; AVX-NEXT:    movslq (%rdi), %rax
739; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
740; AVX-NEXT:    movslq 4(%rdi), %rcx
741; AVX-NEXT:    movslq 8(%rdi), %rdx
742; AVX-NEXT:    movslq 12(%rdi), %rsi
743; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
744; AVX-NEXT:    vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
745; AVX-NEXT:    vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
746; AVX-NEXT:    vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0
747; AVX-NEXT:    retq
748  %p0  = getelementptr inbounds i32, i32* %i, i64 0
749  %p1  = getelementptr inbounds i32, i32* %i, i64 1
750  %p2  = getelementptr inbounds i32, i32* %i, i64 2
751  %p3  = getelementptr inbounds i32, i32* %i, i64 3
752  %i0  = load i32, i32* %p0, align 4
753  %i1  = load i32, i32* %p1, align 4
754  %i2  = load i32, i32* %p2, align 4
755  %i3  = load i32, i32* %p3, align 4
756  %x0 = extractelement <4 x i32> %x, i32 %i0
757  %x1 = extractelement <4 x i32> %x, i32 %i1
758  %x2 = extractelement <4 x i32> %x, i32 %i2
759  %x3 = extractelement <4 x i32> %x, i32 %i3
760  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
761  %r1 = insertelement <4 x i32>   %r0, i32 %x1, i32 1
762  %r2 = insertelement <4 x i32>   %r1, i32 %x2, i32 2
763  %r3 = insertelement <4 x i32>   %r2, i32 %x3, i32 3
764  ret <4 x i32> %r3
765}
766
767define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* %i) nounwind {
768; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
769; SSE2:       # BB#0:
770; SSE2-NEXT:    movsbq (%rdi), %rcx
771; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
772; SSE2-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
773; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
774; SSE2-NEXT:    movd %ecx, %xmm0
775; SSE2-NEXT:    movsbq 8(%rdi), %rcx
776; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
777; SSE2-NEXT:    movd %ecx, %xmm8
778; SSE2-NEXT:    movsbq 12(%rdi), %rcx
779; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
780; SSE2-NEXT:    movd %ecx, %xmm9
781; SSE2-NEXT:    movsbq 4(%rdi), %rcx
782; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
783; SSE2-NEXT:    movd %ecx, %xmm3
784; SSE2-NEXT:    movsbq 14(%rdi), %rcx
785; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
786; SSE2-NEXT:    movd %ecx, %xmm10
787; SSE2-NEXT:    movsbq 6(%rdi), %rcx
788; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
789; SSE2-NEXT:    movd %ecx, %xmm5
790; SSE2-NEXT:    movsbq 10(%rdi), %rcx
791; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
792; SSE2-NEXT:    movd %ecx, %xmm11
793; SSE2-NEXT:    movsbq 2(%rdi), %rcx
794; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
795; SSE2-NEXT:    movd %ecx, %xmm7
796; SSE2-NEXT:    movsbq 15(%rdi), %rcx
797; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
798; SSE2-NEXT:    movd %ecx, %xmm12
799; SSE2-NEXT:    movsbq 7(%rdi), %rcx
800; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
801; SSE2-NEXT:    movd %ecx, %xmm2
802; SSE2-NEXT:    movsbq 11(%rdi), %rcx
803; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
804; SSE2-NEXT:    movd %ecx, %xmm13
805; SSE2-NEXT:    movsbq 3(%rdi), %rcx
806; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
807; SSE2-NEXT:    movd %ecx, %xmm6
808; SSE2-NEXT:    movsbq 13(%rdi), %rcx
809; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
810; SSE2-NEXT:    movd %ecx, %xmm14
811; SSE2-NEXT:    movsbq 5(%rdi), %rcx
812; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
813; SSE2-NEXT:    movd %ecx, %xmm4
814; SSE2-NEXT:    movsbq 9(%rdi), %rcx
815; SSE2-NEXT:    movzbl (%rcx,%rax), %ecx
816; SSE2-NEXT:    movd %ecx, %xmm15
817; SSE2-NEXT:    movsbq 1(%rdi), %rcx
818; SSE2-NEXT:    movzbl (%rcx,%rax), %eax
819; SSE2-NEXT:    movd %eax, %xmm1
820; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
821; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
822; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
823; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
824; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
825; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
826; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
827; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
828; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
829; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
830; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
831; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
832; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
833; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
834; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
835; SSE2-NEXT:    retq
836;
837; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
838; SSSE3:       # BB#0:
839; SSSE3-NEXT:    movsbq (%rdi), %rcx
840; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
841; SSSE3-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
842; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
843; SSSE3-NEXT:    movd %ecx, %xmm0
844; SSSE3-NEXT:    movsbq 8(%rdi), %rcx
845; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
846; SSSE3-NEXT:    movd %ecx, %xmm8
847; SSSE3-NEXT:    movsbq 12(%rdi), %rcx
848; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
849; SSSE3-NEXT:    movd %ecx, %xmm9
850; SSSE3-NEXT:    movsbq 4(%rdi), %rcx
851; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
852; SSSE3-NEXT:    movd %ecx, %xmm3
853; SSSE3-NEXT:    movsbq 14(%rdi), %rcx
854; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
855; SSSE3-NEXT:    movd %ecx, %xmm10
856; SSSE3-NEXT:    movsbq 6(%rdi), %rcx
857; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
858; SSSE3-NEXT:    movd %ecx, %xmm5
859; SSSE3-NEXT:    movsbq 10(%rdi), %rcx
860; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
861; SSSE3-NEXT:    movd %ecx, %xmm11
862; SSSE3-NEXT:    movsbq 2(%rdi), %rcx
863; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
864; SSSE3-NEXT:    movd %ecx, %xmm7
865; SSSE3-NEXT:    movsbq 15(%rdi), %rcx
866; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
867; SSSE3-NEXT:    movd %ecx, %xmm12
868; SSSE3-NEXT:    movsbq 7(%rdi), %rcx
869; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
870; SSSE3-NEXT:    movd %ecx, %xmm2
871; SSSE3-NEXT:    movsbq 11(%rdi), %rcx
872; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
873; SSSE3-NEXT:    movd %ecx, %xmm13
874; SSSE3-NEXT:    movsbq 3(%rdi), %rcx
875; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
876; SSSE3-NEXT:    movd %ecx, %xmm6
877; SSSE3-NEXT:    movsbq 13(%rdi), %rcx
878; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
879; SSSE3-NEXT:    movd %ecx, %xmm14
880; SSSE3-NEXT:    movsbq 5(%rdi), %rcx
881; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
882; SSSE3-NEXT:    movd %ecx, %xmm4
883; SSSE3-NEXT:    movsbq 9(%rdi), %rcx
884; SSSE3-NEXT:    movzbl (%rcx,%rax), %ecx
885; SSSE3-NEXT:    movd %ecx, %xmm15
886; SSSE3-NEXT:    movsbq 1(%rdi), %rcx
887; SSSE3-NEXT:    movzbl (%rcx,%rax), %eax
888; SSSE3-NEXT:    movd %eax, %xmm1
889; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
890; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
891; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
892; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
893; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
894; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
895; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
896; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
897; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
898; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
899; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
900; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
901; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
902; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
903; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
904; SSSE3-NEXT:    retq
905;
906; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
907; SSE41:       # BB#0:
908; SSE41-NEXT:    pushq %rbp
909; SSE41-NEXT:    pushq %r15
910; SSE41-NEXT:    pushq %r14
911; SSE41-NEXT:    pushq %r13
912; SSE41-NEXT:    pushq %r12
913; SSE41-NEXT:    pushq %rbx
914; SSE41-NEXT:    movsbq (%rdi), %rax
915; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
916; SSE41-NEXT:    movsbq 1(%rdi), %r15
917; SSE41-NEXT:    movsbq 2(%rdi), %r8
918; SSE41-NEXT:    movsbq 3(%rdi), %r9
919; SSE41-NEXT:    movsbq 4(%rdi), %r10
920; SSE41-NEXT:    movsbq 5(%rdi), %r11
921; SSE41-NEXT:    movsbq 6(%rdi), %r14
922; SSE41-NEXT:    movsbq 7(%rdi), %r12
923; SSE41-NEXT:    movsbq 8(%rdi), %r13
924; SSE41-NEXT:    movsbq 9(%rdi), %rdx
925; SSE41-NEXT:    movsbq 10(%rdi), %rcx
926; SSE41-NEXT:    movsbq 11(%rdi), %rsi
927; SSE41-NEXT:    movsbq 12(%rdi), %rbx
928; SSE41-NEXT:    leaq -{{[0-9]+}}(%rsp), %rbp
929; SSE41-NEXT:    movzbl (%rax,%rbp), %eax
930; SSE41-NEXT:    movd %eax, %xmm0
931; SSE41-NEXT:    movsbq 13(%rdi), %rax
932; SSE41-NEXT:    pinsrb $1, (%r15,%rbp), %xmm0
933; SSE41-NEXT:    movsbq 14(%rdi), %r15
934; SSE41-NEXT:    movsbq 15(%rdi), %rdi
935; SSE41-NEXT:    movzbl (%rdi,%rbp), %edi
936; SSE41-NEXT:    movzbl (%r15,%rbp), %r15d
937; SSE41-NEXT:    movzbl (%rax,%rbp), %eax
938; SSE41-NEXT:    movzbl (%rbx,%rbp), %ebx
939; SSE41-NEXT:    movzbl (%rsi,%rbp), %esi
940; SSE41-NEXT:    movzbl (%rcx,%rbp), %ecx
941; SSE41-NEXT:    movzbl (%rdx,%rbp), %edx
942; SSE41-NEXT:    movzbl (%r13,%rbp), %r13d
943; SSE41-NEXT:    movzbl (%r12,%rbp), %r12d
944; SSE41-NEXT:    movzbl (%r14,%rbp), %r14d
945; SSE41-NEXT:    movzbl (%r11,%rbp), %r11d
946; SSE41-NEXT:    movzbl (%r10,%rbp), %r10d
947; SSE41-NEXT:    movzbl (%r9,%rbp), %r9d
948; SSE41-NEXT:    movzbl (%r8,%rbp), %ebp
949; SSE41-NEXT:    pinsrb $2, %ebp, %xmm0
950; SSE41-NEXT:    pinsrb $3, %r9d, %xmm0
951; SSE41-NEXT:    pinsrb $4, %r10d, %xmm0
952; SSE41-NEXT:    pinsrb $5, %r11d, %xmm0
953; SSE41-NEXT:    pinsrb $6, %r14d, %xmm0
954; SSE41-NEXT:    pinsrb $7, %r12d, %xmm0
955; SSE41-NEXT:    pinsrb $8, %r13d, %xmm0
956; SSE41-NEXT:    pinsrb $9, %edx, %xmm0
957; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
958; SSE41-NEXT:    pinsrb $11, %esi, %xmm0
959; SSE41-NEXT:    pinsrb $12, %ebx, %xmm0
960; SSE41-NEXT:    pinsrb $13, %eax, %xmm0
961; SSE41-NEXT:    pinsrb $14, %r15d, %xmm0
962; SSE41-NEXT:    pinsrb $15, %edi, %xmm0
963; SSE41-NEXT:    popq %rbx
964; SSE41-NEXT:    popq %r12
965; SSE41-NEXT:    popq %r13
966; SSE41-NEXT:    popq %r14
967; SSE41-NEXT:    popq %r15
968; SSE41-NEXT:    popq %rbp
969; SSE41-NEXT:    retq
970;
971; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
972; AVX:       # BB#0:
973; AVX-NEXT:    pushq %rbp
974; AVX-NEXT:    pushq %r15
975; AVX-NEXT:    pushq %r14
976; AVX-NEXT:    pushq %r13
977; AVX-NEXT:    pushq %r12
978; AVX-NEXT:    pushq %rbx
979; AVX-NEXT:    movsbq (%rdi), %rsi
980; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
981; AVX-NEXT:    movsbq 1(%rdi), %r15
982; AVX-NEXT:    movsbq 2(%rdi), %r8
983; AVX-NEXT:    movsbq 3(%rdi), %r9
984; AVX-NEXT:    movsbq 4(%rdi), %r10
985; AVX-NEXT:    movsbq 5(%rdi), %r11
986; AVX-NEXT:    movsbq 6(%rdi), %r14
987; AVX-NEXT:    movsbq 7(%rdi), %r12
988; AVX-NEXT:    movsbq 8(%rdi), %r13
989; AVX-NEXT:    movsbq 9(%rdi), %rdx
990; AVX-NEXT:    movsbq 10(%rdi), %rax
991; AVX-NEXT:    movsbq 11(%rdi), %rcx
992; AVX-NEXT:    movsbq 12(%rdi), %rbx
993; AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rbp
994; AVX-NEXT:    movzbl (%rsi,%rbp), %esi
995; AVX-NEXT:    vmovd %esi, %xmm0
996; AVX-NEXT:    movsbq 13(%rdi), %rsi
997; AVX-NEXT:    vpinsrb $1, (%r15,%rbp), %xmm0, %xmm0
998; AVX-NEXT:    movsbq 14(%rdi), %r15
999; AVX-NEXT:    movsbq 15(%rdi), %rdi
1000; AVX-NEXT:    movzbl (%rdi,%rbp), %edi
1001; AVX-NEXT:    movzbl (%r15,%rbp), %r15d
1002; AVX-NEXT:    movzbl (%rsi,%rbp), %esi
1003; AVX-NEXT:    movzbl (%rbx,%rbp), %ebx
1004; AVX-NEXT:    movzbl (%rcx,%rbp), %ecx
1005; AVX-NEXT:    movzbl (%rax,%rbp), %eax
1006; AVX-NEXT:    movzbl (%rdx,%rbp), %edx
1007; AVX-NEXT:    movzbl (%r13,%rbp), %r13d
1008; AVX-NEXT:    movzbl (%r12,%rbp), %r12d
1009; AVX-NEXT:    movzbl (%r14,%rbp), %r14d
1010; AVX-NEXT:    movzbl (%r11,%rbp), %r11d
1011; AVX-NEXT:    movzbl (%r10,%rbp), %r10d
1012; AVX-NEXT:    movzbl (%r9,%rbp), %r9d
1013; AVX-NEXT:    movzbl (%r8,%rbp), %ebp
1014; AVX-NEXT:    vpinsrb $2, %ebp, %xmm0, %xmm0
1015; AVX-NEXT:    vpinsrb $3, %r9d, %xmm0, %xmm0
1016; AVX-NEXT:    vpinsrb $4, %r10d, %xmm0, %xmm0
1017; AVX-NEXT:    vpinsrb $5, %r11d, %xmm0, %xmm0
1018; AVX-NEXT:    vpinsrb $6, %r14d, %xmm0, %xmm0
1019; AVX-NEXT:    vpinsrb $7, %r12d, %xmm0, %xmm0
1020; AVX-NEXT:    vpinsrb $8, %r13d, %xmm0, %xmm0
1021; AVX-NEXT:    vpinsrb $9, %edx, %xmm0, %xmm0
1022; AVX-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1023; AVX-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
1024; AVX-NEXT:    vpinsrb $12, %ebx, %xmm0, %xmm0
1025; AVX-NEXT:    vpinsrb $13, %esi, %xmm0, %xmm0
1026; AVX-NEXT:    vpinsrb $14, %r15d, %xmm0, %xmm0
1027; AVX-NEXT:    vpinsrb $15, %edi, %xmm0, %xmm0
1028; AVX-NEXT:    popq %rbx
1029; AVX-NEXT:    popq %r12
1030; AVX-NEXT:    popq %r13
1031; AVX-NEXT:    popq %r14
1032; AVX-NEXT:    popq %r15
1033; AVX-NEXT:    popq %rbp
1034; AVX-NEXT:    retq
1035  %p0  = getelementptr inbounds i8, i8* %i, i64 0
1036  %p1  = getelementptr inbounds i8, i8* %i, i64 1
1037  %p2  = getelementptr inbounds i8, i8* %i, i64 2
1038  %p3  = getelementptr inbounds i8, i8* %i, i64 3
1039  %p4  = getelementptr inbounds i8, i8* %i, i64 4
1040  %p5  = getelementptr inbounds i8, i8* %i, i64 5
1041  %p6  = getelementptr inbounds i8, i8* %i, i64 6
1042  %p7  = getelementptr inbounds i8, i8* %i, i64 7
1043  %p8  = getelementptr inbounds i8, i8* %i, i64 8
1044  %p9  = getelementptr inbounds i8, i8* %i, i64 9
1045  %p10 = getelementptr inbounds i8, i8* %i, i64 10
1046  %p11 = getelementptr inbounds i8, i8* %i, i64 11
1047  %p12 = getelementptr inbounds i8, i8* %i, i64 12
1048  %p13 = getelementptr inbounds i8, i8* %i, i64 13
1049  %p14 = getelementptr inbounds i8, i8* %i, i64 14
1050  %p15 = getelementptr inbounds i8, i8* %i, i64 15
1051  %i0  = load i8, i8* %p0 , align 4
1052  %i1  = load i8, i8* %p1 , align 4
1053  %i2  = load i8, i8* %p2 , align 4
1054  %i3  = load i8, i8* %p3 , align 4
1055  %i4  = load i8, i8* %p4 , align 4
1056  %i5  = load i8, i8* %p5 , align 4
1057  %i6  = load i8, i8* %p6 , align 4
1058  %i7  = load i8, i8* %p7 , align 4
1059  %i8  = load i8, i8* %p8 , align 4
1060  %i9  = load i8, i8* %p9 , align 4
1061  %i10 = load i8, i8* %p10, align 4
1062  %i11 = load i8, i8* %p11, align 4
1063  %i12 = load i8, i8* %p12, align 4
1064  %i13 = load i8, i8* %p13, align 4
1065  %i14 = load i8, i8* %p14, align 4
1066  %i15 = load i8, i8* %p15, align 4
1067  %x0  = extractelement <16 x i8> %x, i8 %i0
1068  %x1  = extractelement <16 x i8> %x, i8 %i1
1069  %x2  = extractelement <16 x i8> %x, i8 %i2
1070  %x3  = extractelement <16 x i8> %x, i8 %i3
1071  %x4  = extractelement <16 x i8> %x, i8 %i4
1072  %x5  = extractelement <16 x i8> %x, i8 %i5
1073  %x6  = extractelement <16 x i8> %x, i8 %i6
1074  %x7  = extractelement <16 x i8> %x, i8 %i7
1075  %x8  = extractelement <16 x i8> %x, i8 %i8
1076  %x9  = extractelement <16 x i8> %x, i8 %i9
1077  %x10 = extractelement <16 x i8> %x, i8 %i10
1078  %x11 = extractelement <16 x i8> %x, i8 %i11
1079  %x12 = extractelement <16 x i8> %x, i8 %i12
1080  %x13 = extractelement <16 x i8> %x, i8 %i13
1081  %x14 = extractelement <16 x i8> %x, i8 %i14
1082  %x15 = extractelement <16 x i8> %x, i8 %i15
1083  %r0  = insertelement <16 x i8> undef, i8 %x0 , i32 0
1084  %r1  = insertelement <16 x i8>  %r0 , i8 %x1 , i32 1
1085  %r2  = insertelement <16 x i8>  %r1 , i8 %x2 , i32 2
1086  %r3  = insertelement <16 x i8>  %r2 , i8 %x3 , i32 3
1087  %r4  = insertelement <16 x i8>  %r3 , i8 %x4 , i32 4
1088  %r5  = insertelement <16 x i8>  %r4 , i8 %x5 , i32 5
1089  %r6  = insertelement <16 x i8>  %r5 , i8 %x6 , i32 6
1090  %r7  = insertelement <16 x i8>  %r6 , i8 %x7 , i32 7
1091  %r8  = insertelement <16 x i8>  %r7 , i8 %x8 , i32 8
1092  %r9  = insertelement <16 x i8>  %r8 , i8 %x9 , i32 9
1093  %r10 = insertelement <16 x i8>  %r9 , i8 %x10, i32 10
1094  %r11 = insertelement <16 x i8>  %r10, i8 %x11, i32 11
1095  %r12 = insertelement <16 x i8>  %r11, i8 %x12, i32 12
1096  %r13 = insertelement <16 x i8>  %r12, i8 %x13, i32 13
1097  %r14 = insertelement <16 x i8>  %r13, i8 %x14, i32 14
1098  %r15 = insertelement <16 x i8>  %r14, i8 %x15, i32 15
1099  ret <16 x i8> %r15
1100}
1101
1102;
1103; Binary shuffle indices from registers
1104;
1105
1106define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
1107; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
1108; SSE:       # BB#0:
1109; SSE-NEXT:    movslq %edi, %rax
1110; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1111; SSE-NEXT:    movslq %edx, %rdx
1112; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1113; SSE-NEXT:    movslq %ecx, %rcx
1114; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1115; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1116; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1117; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1118; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1119; SSE-NEXT:    retq
1120;
1121; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
1122; AVX:       # BB#0:
1123; AVX-NEXT:    movslq %edi, %rax
1124; AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
1125; AVX-NEXT:    movslq %edx, %rdx
1126; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1127; AVX-NEXT:    movslq %ecx, %rcx
1128; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1129; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1130; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1131; AVX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1132; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1133; AVX-NEXT:    retq
1134  %x0 = extractelement <4 x float> %x, i32 %i0
1135  %x1 = extractelement <4 x float> %x, i32 %i1
1136  %y2 = extractelement <4 x float> %y, i32 %i2
1137  %x3 = extractelement <4 x float> %x, i32 %i3
1138  %r0 = insertelement <4 x float> undef, float %x0, i32 0
1139  %r1 = insertelement <4 x float>   %r0, float 0.0, i32 1
1140  %r2 = insertelement <4 x float>   %r1, float %y2, i32 2
1141  %r3 = insertelement <4 x float>   %r2, float %x3, i32 3
1142  ret <4 x float> %r3
1143}
1144
1145define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %y, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
1146; SSE2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
1147; SSE2:       # BB#0:
1148; SSE2-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
1149; SSE2-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
1150; SSE2-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
1151; SSE2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
1152; SSE2-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
1153; SSE2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
1154; SSE2-NEXT:    movswq %di, %r10
1155; SSE2-NEXT:    movswq %si, %rsi
1156; SSE2-NEXT:    movswq %dx, %r11
1157; SSE2-NEXT:    movswq %cx, %rcx
1158; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1159; SSE2-NEXT:    movswq %r8w, %rdi
1160; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1161; SSE2-NEXT:    movswq %r9w, %rax
1162; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
1163; SSE2-NEXT:    xorl %edx, %edx
1164; SSE2-NEXT:    movd %edx, %xmm0
1165; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
1166; SSE2-NEXT:    movd %ecx, %xmm1
1167; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1168; SSE2-NEXT:    movd %esi, %xmm2
1169; SSE2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
1170; SSE2-NEXT:    movd %eax, %xmm3
1171; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1172; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1173; SSE2-NEXT:    movzwl -40(%rsp,%r10,2), %eax
1174; SSE2-NEXT:    movzwl -40(%rsp,%r11,2), %ecx
1175; SSE2-NEXT:    movd %ecx, %xmm1
1176; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1177; SSE2-NEXT:    movd %eax, %xmm0
1178; SSE2-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
1179; SSE2-NEXT:    movd %eax, %xmm3
1180; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1181; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1182; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1183; SSE2-NEXT:    retq
1184;
1185; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
1186; SSSE3:       # BB#0:
1187; SSSE3-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
1188; SSSE3-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
1189; SSSE3-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
1190; SSSE3-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
1191; SSSE3-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
1192; SSSE3-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
1193; SSSE3-NEXT:    movswq %di, %r10
1194; SSSE3-NEXT:    movswq %si, %rsi
1195; SSSE3-NEXT:    movswq %dx, %r11
1196; SSSE3-NEXT:    movswq %cx, %rcx
1197; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1198; SSSE3-NEXT:    movswq %r8w, %rdi
1199; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1200; SSSE3-NEXT:    movswq %r9w, %rax
1201; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
1202; SSSE3-NEXT:    xorl %edx, %edx
1203; SSSE3-NEXT:    movd %edx, %xmm0
1204; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
1205; SSSE3-NEXT:    movd %ecx, %xmm1
1206; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1207; SSSE3-NEXT:    movd %esi, %xmm2
1208; SSSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
1209; SSSE3-NEXT:    movd %eax, %xmm3
1210; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1211; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1212; SSSE3-NEXT:    movzwl -40(%rsp,%r10,2), %eax
1213; SSSE3-NEXT:    movzwl -40(%rsp,%r11,2), %ecx
1214; SSSE3-NEXT:    movd %ecx, %xmm1
1215; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1216; SSSE3-NEXT:    movd %eax, %xmm0
1217; SSSE3-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
1218; SSSE3-NEXT:    movd %eax, %xmm3
1219; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1220; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1221; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1222; SSSE3-NEXT:    retq
1223;
1224; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
1225; SSE41:       # BB#0:
1226; SSE41-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
1227; SSE41-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
1228; SSE41-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
1229; SSE41-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
1230; SSE41-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
1231; SSE41-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
1232; SSE41-NEXT:    movswq %di, %rax
1233; SSE41-NEXT:    movswq %si, %rsi
1234; SSE41-NEXT:    movswq %dx, %rdx
1235; SSE41-NEXT:    movswq %cx, %r10
1236; SSE41-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
1237; SSE41-NEXT:    movswq %r8w, %rdi
1238; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1239; SSE41-NEXT:    movswq %r9w, %rcx
1240; SSE41-NEXT:    movzwl -40(%rsp,%rax,2), %eax
1241; SSE41-NEXT:    movd %eax, %xmm1
1242; SSE41-NEXT:    pinsrw $1, -24(%rsp,%rsi,2), %xmm1
1243; SSE41-NEXT:    pinsrw $2, -40(%rsp,%rdx,2), %xmm1
1244; SSE41-NEXT:    pinsrw $3, -24(%rsp,%r10,2), %xmm1
1245; SSE41-NEXT:    pinsrw $4, -40(%rsp,%rdi,2), %xmm1
1246; SSE41-NEXT:    pinsrw $5, -24(%rsp,%rcx,2), %xmm1
1247; SSE41-NEXT:    pxor %xmm0, %xmm0
1248; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
1249; SSE41-NEXT:    retq
1250;
1251; AVX1-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
1252; AVX1:       # BB#0:
1253; AVX1-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
1254; AVX1-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
1255; AVX1-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
1256; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
1257; AVX1-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
1258; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
1259; AVX1-NEXT:    movswq %di, %r10
1260; AVX1-NEXT:    movswq %si, %r11
1261; AVX1-NEXT:    movswq %dx, %rdx
1262; AVX1-NEXT:    movswq %cx, %rcx
1263; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1264; AVX1-NEXT:    movswq %r8w, %rdi
1265; AVX1-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
1266; AVX1-NEXT:    movswq %r9w, %rax
1267; AVX1-NEXT:    movzwl -40(%rsp,%r10,2), %esi
1268; AVX1-NEXT:    vmovd %esi, %xmm0
1269; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
1270; AVX1-NEXT:    vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
1271; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
1272; AVX1-NEXT:    vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
1273; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
1274; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1275; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1276; AVX1-NEXT:    retq
1277;
1278; AVX2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
1279; AVX2:       # BB#0:
1280; AVX2-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
1281; AVX2-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
1282; AVX2-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
1283; AVX2-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
1284; AVX2-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
1285; AVX2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
1286; AVX2-NEXT:    movswq %di, %r10
1287; AVX2-NEXT:    movswq %si, %r11
1288; AVX2-NEXT:    movswq %dx, %rdx
1289; AVX2-NEXT:    movswq %cx, %rcx
1290; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1291; AVX2-NEXT:    movswq %r8w, %rdi
1292; AVX2-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
1293; AVX2-NEXT:    movswq %r9w, %rax
1294; AVX2-NEXT:    movzwl -40(%rsp,%r10,2), %esi
1295; AVX2-NEXT:    vmovd %esi, %xmm0
1296; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
1297; AVX2-NEXT:    vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
1298; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
1299; AVX2-NEXT:    vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
1300; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
1301; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1302; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1303; AVX2-NEXT:    retq
1304  %x0 = extractelement <8 x i16> %x, i16 %i0
1305  %y1 = extractelement <8 x i16> %y, i16 %i1
1306  %x2 = extractelement <8 x i16> %x, i16 %i2
1307  %y3 = extractelement <8 x i16> %y, i16 %i3
1308  %x4 = extractelement <8 x i16> %x, i16 %i4
1309  %y5 = extractelement <8 x i16> %y, i16 %i5
1310  %x6 = extractelement <8 x i16> %x, i16 %i6
1311  %x7 = extractelement <8 x i16> %x, i16 %i7
1312  %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
1313  %r1 = insertelement <8 x i16>   %r0, i16 %y1, i32 1
1314  %r2 = insertelement <8 x i16>   %r1, i16 %x2, i32 2
1315  %r3 = insertelement <8 x i16>   %r2, i16 %y3, i32 3
1316  %r4 = insertelement <8 x i16>   %r3, i16 %x4, i32 4
1317  %r5 = insertelement <8 x i16>   %r4, i16 %y5, i32 5
1318  %r6 = insertelement <8 x i16>   %r5, i16   0, i32 6
1319  %r7 = insertelement <8 x i16>   %r6, i16   0, i32 7
1320  ret <8 x i16> %r7
1321}
1322