; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
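; Each of these tests builds a 256-bit result vector from elements selected by
; variable (runtime) indices. The expected lowering generally spills the source
; vector to a stack slot and reloads the chosen elements as scalars; the AVX2
; <8 x float> case uses a vpermps per index instead.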

;
; Unary shuffle indices from registers
;

define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
; ALL:       # BB#0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double>   %r0, double %x1, i32 1
  %r2 = insertelement <4 x double>   %r1, double %x2, i32 2
  %r3 = insertelement <4 x double>   %r2, double %x3, i32 3
  ret <4 x double> %r3
}

define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
; ALL:       # BB#0:
; ALL-NEXT:    pushq %rbp
; ALL-NEXT:    movq %rsp, %rbp
; ALL-NEXT:    andq $-32, %rsp
; ALL-NEXT:    subq $64, %rsp
; ALL-NEXT:    vmovaps %ymm0, (%rsp)
; ALL-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    movq %rbp, %rsp
; ALL-NEXT:    popq %rbp
; ALL-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double undef, i32 0
  %r1 = insertelement <4 x double>   %r0, double   %x1, i32 1
  %r2 = insertelement <4 x double>   %r1, double   %x2, i32 2
  %r3 = insertelement <4 x double>   %r2, double   0.0, i32 3
  ret <4 x double> %r3
}

define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i64 %i0
  %x1 = extractelement <2 x double> %x, i64 %i1
  %x2 = extractelement <2 x double> %x, i64 %i2
  %x3 = extractelement <2 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double>   %r0, double %x1, i32 1
  %r2 = insertelement <4 x double>   %r1, double %x2, i32 2
  %r3 = insertelement <4 x double>   %r2, double %x3, i32 3
  ret <4 x double> %r3
}

define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64   0, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64   0, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    movslq %esi, %rsi
; AVX1-NEXT:    movslq %edx, %rdx
; AVX1-NEXT:    movslq %ecx, %r11
; AVX1-NEXT:    movslq %r8d, %r10
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    movslq %r9d, %r8
; AVX1-NEXT:    movslq 16(%rbp), %rdi
; AVX1-NEXT:    movslq 24(%rbp), %rcx
; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vmovd %esi, %xmm2
; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm2
; AVX2-NEXT:    vmovd %edx, %xmm3
; AVX2-NEXT:    vpermps %ymm0, %ymm3, %ymm3
; AVX2-NEXT:    vmovd %ecx, %xmm4
; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm4
; AVX2-NEXT:    vmovd %r8d, %xmm5
; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm5
; AVX2-NEXT:    vmovd %r9d, %xmm6
; AVX2-NEXT:    vpermps %ymm0, %ymm6, %ymm6
; AVX2-NEXT:    vmovd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpermps %ymm0, %ymm7, %ymm7
; AVX2-NEXT:    vmovd {{.*#+}} xmm8 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 %i0
  %x1 = extractelement <8 x float> %x, i32 %i1
  %x2 = extractelement <8 x float> %x, i32 %i2
  %x3 = extractelement <8 x float> %x, i32 %i3
  %x4 = extractelement <8 x float> %x, i32 %i4
  %x5 = extractelement <8 x float> %x, i32 %i5
  %x6 = extractelement <8 x float> %x, i32 %i6
  %x7 = extractelement <8 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float>   %r0, float %x1, i32 1
  %r2 = insertelement <8 x float>   %r1, float %x2, i32 2
  %r3 = insertelement <8 x float>   %r2, float %x3, i32 3
  %r4 = insertelement <8 x float>   %r3, float %x4, i32 4
  %r5 = insertelement <8 x float>   %r4, float %x5, i32 5
  %r6 = insertelement <8 x float>   %r5, float %x6, i32 6
  %r7 = insertelement <8 x float>   %r6, float %x7, i32 7
  ret <8 x float> %r7
}

define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
; ALL:       # BB#0:
; ALL-NEXT:    movslq %edi, %rax
; ALL-NEXT:    movslq %esi, %rsi
; ALL-NEXT:    movslq %edx, %rdx
; ALL-NEXT:    movslq %ecx, %r11
; ALL-NEXT:    movslq %r8d, %r10
; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT:    movslq %r9d, %r8
; ALL-NEXT:    movslq {{[0-9]+}}(%rsp), %rdi
; ALL-NEXT:    movslq {{[0-9]+}}(%rsp), %rcx
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; ALL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; ALL-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; ALL-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 %i0
  %x1 = extractelement <4 x float> %x, i32 %i1
  %x2 = extractelement <4 x float> %x, i32 %i2
  %x3 = extractelement <4 x float> %x, i32 %i3
  %x4 = extractelement <4 x float> %x, i32 %i4
  %x5 = extractelement <4 x float> %x, i32 %i5
  %x6 = extractelement <4 x float> %x, i32 %i6
  %x7 = extractelement <4 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float>   %r0, float %x1, i32 1
  %r2 = insertelement <8 x float>   %r1, float %x2, i32 2
  %r3 = insertelement <8 x float>   %r2, float %x3, i32 3
  %r4 = insertelement <8 x float>   %r3, float %x4, i32 4
  %r5 = insertelement <8 x float>   %r4, float %x5, i32 5
  %r6 = insertelement <8 x float>   %r5, float %x6, i32 6
  %r7 = insertelement <8 x float>   %r6, float %x7, i32 7
  ret <8 x float> %r7
}

define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    movslq 32(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    movslq 40(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 48(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 56(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 64(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 72(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 80(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq 88(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    movslq %esi, %rax
; AVX1-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %edx, %rax
; AVX1-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %ecx, %rax
; AVX1-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r8d, %rax
; AVX1-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r9d, %rax
; AVX1-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq 16(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX1-NEXT:    movslq 24(%rbp), %rax
; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    movslq 32(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    movslq 40(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 48(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 56(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 64(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 72(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 80(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq 88(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq %edi, %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    movslq %esi, %rax
; AVX2-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %edx, %rax
; AVX2-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %ecx, %rax
; AVX2-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r8d, %rax
; AVX2-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r9d, %rax
; AVX2-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq 16(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX2-NEXT:    movslq 24(%rbp), %rax
; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %x0  = extractelement <16 x i16> %x, i32 %i0
  %x1  = extractelement <16 x i16> %x, i32 %i1
  %x2  = extractelement <16 x i16> %x, i32 %i2
  %x3  = extractelement <16 x i16> %x, i32 %i3
  %x4  = extractelement <16 x i16> %x, i32 %i4
  %x5  = extractelement <16 x i16> %x, i32 %i5
  %x6  = extractelement <16 x i16> %x, i32 %i6
  %x7  = extractelement <16 x i16> %x, i32 %i7
  %x8  = extractelement <16 x i16> %x, i32 %i8
  %x9  = extractelement <16 x i16> %x, i32 %i9
  %x10 = extractelement <16 x i16> %x, i32 %i10
  %x11 = extractelement <16 x i16> %x, i32 %i11
  %x12 = extractelement <16 x i16> %x, i32 %i12
  %x13 = extractelement <16 x i16> %x, i32 %i13
  %x14 = extractelement <16 x i16> %x, i32 %i14
  %x15 = extractelement <16 x i16> %x, i32 %i15
  %r0  = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1  = insertelement <16 x i16>  %r0 , i16 %x1 , i32 1
  %r2  = insertelement <16 x i16>  %r1 , i16 %x2 , i32 2
  %r3  = insertelement <16 x i16>  %r2 , i16 %x3 , i32 3
  %r4  = insertelement <16 x i16>  %r3 , i16 %x4 , i32 4
  %r5  = insertelement <16 x i16>  %r4 , i16 %x5 , i32 5
  %r6  = insertelement <16 x i16>  %r5 , i16 %x6 , i32 6
  %r7  = insertelement <16 x i16>  %r6 , i16 %x7 , i32 7
  %r8  = insertelement <16 x i16>  %r7 , i16 %x8 , i32 8
  %r9  = insertelement <16 x i16>  %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16>  %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16>  %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16>  %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16>  %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16>  %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16>  %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    movslq %esi, %rax
; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %edx, %rax
; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %ecx, %rax
; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r8d, %rax
; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq %r9d, %rax
; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX1-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT:    movslq %edi, %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    movslq %esi, %rax
; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %edx, %rax
; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %ecx, %rax
; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r8d, %rax
; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq %r9d, %rax
; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; AVX2-NEXT:    movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %x0  = extractelement <8 x i16> %x, i32 %i0
  %x1  = extractelement <8 x i16> %x, i32 %i1
  %x2  = extractelement <8 x i16> %x, i32 %i2
  %x3  = extractelement <8 x i16> %x, i32 %i3
  %x4  = extractelement <8 x i16> %x, i32 %i4
  %x5  = extractelement <8 x i16> %x, i32 %i5
  %x6  = extractelement <8 x i16> %x, i32 %i6
  %x7  = extractelement <8 x i16> %x, i32 %i7
  %x8  = extractelement <8 x i16> %x, i32 %i8
  %x9  = extractelement <8 x i16> %x, i32 %i9
  %x10 = extractelement <8 x i16> %x, i32 %i10
  %x11 = extractelement <8 x i16> %x, i32 %i11
  %x12 = extractelement <8 x i16> %x, i32 %i12
  %x13 = extractelement <8 x i16> %x, i32 %i13
  %x14 = extractelement <8 x i16> %x, i32 %i14
  %x15 = extractelement <8 x i16> %x, i32 %i15
  %r0  = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1  = insertelement <16 x i16>  %r0 , i16 %x1 , i32 1
  %r2  = insertelement <16 x i16>  %r1 , i16 %x2 , i32 2
  %r3  = insertelement <16 x i16>  %r2 , i16 %x3 , i32 3
  %r4  = insertelement <16 x i16>  %r3 , i16 %x4 , i32 4
  %r5  = insertelement <16 x i16>  %r4 , i16 %x5 , i32 5
  %r6  = insertelement <16 x i16>  %r5 , i16 %x6 , i32 6
  %r7  = insertelement <16 x i16>  %r6 , i16 %x7 , i32 7
  %r8  = insertelement <16 x i16>  %r7 , i16 %x8 , i32 8
  %r9  = insertelement <16 x i16>  %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16>  %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16>  %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16>  %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16>  %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16>  %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16>  %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

;
; Unary shuffle indices from memory
;

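; Same as the variable-index tests above, except that the shuffle indices are
; loaded from an i64* argument rather than passed directly in registers.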
define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    movq %rsp, %rbp
; AVX1-NEXT:    andq $-32, %rsp
; AVX1-NEXT:    subq $64, %rsp
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq 8(%rdi), %rcx
; AVX1-NEXT:    movq 16(%rdi), %rdx
; AVX1-NEXT:    movq 24(%rdi), %rsi
; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    movq %rbp, %rsp
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    pushq %rbp
; AVX2-NEXT:    movq %rsp, %rbp
; AVX2-NEXT:    andq $-32, %rsp
; AVX2-NEXT:    subq $64, %rsp
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq 8(%rdi), %rcx
; AVX2-NEXT:    movq 16(%rdi), %rdx
; AVX2-NEXT:    movq 24(%rdi), %rsi
; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    movq %rbp, %rsp
; AVX2-NEXT:    popq %rbp
; AVX2-NEXT:    retq
  %p0  = getelementptr inbounds i64, i64* %i, i32 0
  %p1  = getelementptr inbounds i64, i64* %i, i32 1
  %p2  = getelementptr inbounds i64, i64* %i, i32 2
  %p3  = getelementptr inbounds i64, i64* %i, i32 3
  %i0  = load i64, i64* %p0, align 4
  %i1  = load i64, i64* %p1, align 4
  %i2  = load i64, i64* %p2, align 4
  %i3  = load i64, i64* %p3, align 4
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    movq (%rdi), %rax
; AVX1-NEXT:    movq 8(%rdi), %rcx
; AVX1-NEXT:    movq 16(%rdi), %rdx
; AVX1-NEXT:    movq 24(%rdi), %rsi
; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    movq (%rdi), %rax
; AVX2-NEXT:    movq 8(%rdi), %rcx
; AVX2-NEXT:    movq 16(%rdi), %rdx
; AVX2-NEXT:    movq 24(%rdi), %rsi
; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %p0  = getelementptr inbounds i64, i64* %i, i32 0
  %p1  = getelementptr inbounds i64, i64* %i, i32 1
  %p2  = getelementptr inbounds i64, i64* %i, i32 2
  %p3  = getelementptr inbounds i64, i64* %i, i32 3
  %i0  = load i64, i64* %p0, align 4
  %i1  = load i64, i64* %p1, align 4
  %i2  = load i64, i64* %p2, align 4
  %i3  = load i64, i64* %p3, align 4
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64>   %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64>   %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64>   %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}