; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=INT256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=INT256,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=INT256,AVX512VL,VLVBMI

define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT:    vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x i64> %v, i64 %index0
  %v1 = extractelement <4 x i64> %v, i64 %index1
  %v2 = extractelement <4 x i64> %v, i64 %index2
  %v3 = extractelement <4 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v8i32:
; INT256:       # %bb.0:
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x i32> %v, i32 %index0
  %v1 = extractelement <8 x i32> %v, i32 %index1
  %v2 = extractelement <8 x i32> %v, i32 %index2
  %v3 = extractelement <8 x i32> %v, i32 %index3
  %v4 = extractelement <8 x i32> %v, i32 %index4
  %v5 = extractelement <8 x i32> %v, i32 %index5
  %v6 = extractelement <8 x i32> %v, i32 %index6
  %v7 = extractelement <8 x i32> %v, i32 %index7
  %ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
  %ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
  %ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
  %ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
  ret <8 x i32> %ret7
}

define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT:    vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT:    vpperm %xmm1, %xmm2, %xmm0, %xmm1
; XOP-NEXT:    vpperm %xmm4, %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm6
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm5, %xmm4
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VLDQ-LABEL: var_shuffle_v16i16:
; AVX512VLDQ:       # %bb.0:
; AVX512VLDQ-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v16i16:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    retq
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <16 x i16> %v, i16 %index0
  %v1 = extractelement <16 x i16> %v, i16 %index1
  %v2 = extractelement <16 x i16> %v, i16 %index2
  %v3 = extractelement <16 x i16> %v, i16 %index3
  %v4 = extractelement <16 x i16> %v, i16 %index4
  %v5 = extractelement <16 x i16> %v, i16 %index5
  %v6 = extractelement <16 x i16> %v, i16 %index6
  %v7 = extractelement <16 x i16> %v, i16 %index7
  %v8 = extractelement <16 x i16> %v, i16 %index8
  %v9 = extractelement <16 x i16> %v, i16 %index9
  %v10 = extractelement <16 x i16> %v, i16 %index10
  %v11 = extractelement <16 x i16> %v, i16 %index11
  %v12 = extractelement <16 x i16> %v, i16 %index12
  %v13 = extractelement <16 x i16> %v, i16 %index13
  %v14 = extractelement <16 x i16> %v, i16 %index14
  %v15 = extractelement <16 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT:    vpperm %xmm2, %xmm3, %xmm0, %xmm2
; XOP-NEXT:    vpperm %xmm1, %xmm3, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm6
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm4
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VLDQ-LABEL: var_shuffle_v32i8:
; AVX512VLDQ:       # %bb.0:
; AVX512VLDQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm2, %ymm2
; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX512VLBW-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512VLBW-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v32i8:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <32 x i8> %v, i8 %index0
  %v1 = extractelement <32 x i8> %v, i8 %index1
  %v2 = extractelement <32 x i8> %v, i8 %index2
  %v3 = extractelement <32 x i8> %v, i8 %index3
  %v4 = extractelement <32 x i8> %v, i8 %index4
  %v5 = extractelement <32 x i8> %v, i8 %index5
  %v6 = extractelement <32 x i8> %v, i8 %index6
  %v7 = extractelement <32 x i8> %v, i8 %index7
  %v8 = extractelement <32 x i8> %v, i8 %index8
  %v9 = extractelement <32 x i8> %v, i8 %index9
  %v10 = extractelement <32 x i8> %v, i8 %index10
  %v11 = extractelement <32 x i8> %v, i8 %index11
  %v12 = extractelement <32 x i8> %v, i8 %index12
  %v13 = extractelement <32 x i8> %v, i8 %index13
  %v14 = extractelement <32 x i8> %v, i8 %index14
  %v15 = extractelement <32 x i8> %v, i8 %index15
  %v16 = extractelement <32 x i8> %v, i8 %index16
  %v17 = extractelement <32 x i8> %v, i8 %index17
  %v18 = extractelement <32 x i8> %v, i8 %index18
  %v19 = extractelement <32 x i8> %v, i8 %index19
  %v20 = extractelement <32 x i8> %v, i8 %index20
  %v21 = extractelement <32 x i8> %v, i8 %index21
  %v22 = extractelement <32 x i8> %v, i8 %index22
  %v23 = extractelement <32 x i8> %v, i8 %index23
  %v24 = extractelement <32 x i8> %v, i8 %index24
  %v25 = extractelement <32 x i8> %v, i8 %index25
  %v26 = extractelement <32 x i8> %v, i8 %index26
  %v27 = extractelement <32 x i8> %v, i8 %index27
  %v28 = extractelement <32 x i8> %v, i8 %index28
  %v29 = extractelement <32 x i8> %v, i8 %index29
  %v30 = extractelement <32 x i8> %v, i8 %index30
  %v31 = extractelement <32 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64:
; XOP:       # %bb.0:
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm4
; AVX1-NEXT:    vpermilpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilpd %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v4f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <4 x double> %v, i64 %index0
  %v1 = extractelement <4 x double> %v, i64 %index1
  %v2 = extractelement <4 x double> %v, i64 %index2
  %v3 = extractelement <4 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
; XOP-LABEL: var_shuffle_v8f32:
; XOP:       # %bb.0:
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v8f32:
; INT256:       # %bb.0:
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    retq
  %index0 = extractelement <8 x i32> %indices, i32 0
  %index1 = extractelement <8 x i32> %indices, i32 1
  %index2 = extractelement <8 x i32> %indices, i32 2
  %index3 = extractelement <8 x i32> %indices, i32 3
  %index4 = extractelement <8 x i32> %indices, i32 4
  %index5 = extractelement <8 x i32> %indices, i32 5
  %index6 = extractelement <8 x i32> %indices, i32 6
  %index7 = extractelement <8 x i32> %indices, i32 7
  %v0 = extractelement <8 x float> %v, i32 %index0
  %v1 = extractelement <8 x float> %v, i32 %index1
  %v2 = extractelement <8 x float> %v, i32 %index2
  %v3 = extractelement <8 x float> %v, i32 %index3
  %v4 = extractelement <8 x float> %v, i32 %index4
  %v5 = extractelement <8 x float> %v, i32 %index5
  %v6 = extractelement <8 x float> %v, i32 %index6
  %v7 = extractelement <8 x float> %v, i32 %index7
  %ret0 = insertelement <8 x float> undef, float %v0, i32 0
  %ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
  %ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
  %ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
  %ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
  %ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
  %ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
  %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
  ret <8 x float> %ret7
}

;
; PR35820 - Unequal source/destination vector sizes
;

define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4i64_from_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm3
; AVX1-NEXT:    vpermilpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtq {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vpermilpd %ymm3, %ymm0, %ymm2
; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm3
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v4i64_from_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT:    vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %v2 = extractelement <2 x i64> %v, i64 %index2
  %v3 = extractelement <2 x i64> %v, i64 %index3
  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
  ret <4 x i64> %ret3
}

define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8i32_from_v4i32:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
; INT256:       # %bb.0: # %entry
; INT256-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    retq
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
  %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
  %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
  %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
  %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
  %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
  ret <8 x i32> %tmp16
}

define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indices) nounwind {
; XOP-LABEL: var_shuffle_v16i16_from_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,256,256,256,256,256,256,256]
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [514,514,514,514,514,514,514,514]
; XOP-NEXT:    vpmacsww %xmm2, %xmm3, %xmm1, %xmm4
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpmacsww %xmm2, %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vpperm %xmm1, %xmm0, %xmm0, %xmm1
; XOP-NEXT:    vpperm %xmm4, %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [514,514,514,514,514,514,514,514]
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [256,256,256,256,256,256,256,256]
; AVX1-NEXT:    vpaddw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm1, %xmm4
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VLDQ-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLDQ:       # %bb.0:
; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v16i16_from_v8i16:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT:    vpermw %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    retq
  %index0 = extractelement <16 x i16> %indices, i32 0
  %index1 = extractelement <16 x i16> %indices, i32 1
  %index2 = extractelement <16 x i16> %indices, i32 2
  %index3 = extractelement <16 x i16> %indices, i32 3
  %index4 = extractelement <16 x i16> %indices, i32 4
  %index5 = extractelement <16 x i16> %indices, i32 5
  %index6 = extractelement <16 x i16> %indices, i32 6
  %index7 = extractelement <16 x i16> %indices, i32 7
  %index8 = extractelement <16 x i16> %indices, i32 8
  %index9 = extractelement <16 x i16> %indices, i32 9
  %index10 = extractelement <16 x i16> %indices, i32 10
  %index11 = extractelement <16 x i16> %indices, i32 11
  %index12 = extractelement <16 x i16> %indices, i32 12
  %index13 = extractelement <16 x i16> %indices, i32 13
  %index14 = extractelement <16 x i16> %indices, i32 14
  %index15 = extractelement <16 x i16> %indices, i32 15
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %v8 = extractelement <8 x i16> %v, i16 %index8
  %v9 = extractelement <8 x i16> %v, i16 %index9
  %v10 = extractelement <8 x i16> %v, i16 %index10
  %v11 = extractelement <8 x i16> %v, i16 %index11
  %v12 = extractelement <8 x i16> %v, i16 %index12
  %v13 = extractelement <8 x i16> %v, i16 %index13
  %v14 = extractelement <8 x i16> %v, i16 %index14
  %v15 = extractelement <8 x i16> %v, i16 %index15
  %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
  %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
  %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
  %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
  %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
  %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
  %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
  %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
  %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
  ret <16 x i16> %ret15
}

define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-LABEL: var_shuffle_v32i8_from_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm2
; XOP-NEXT:    vpperm %xmm1, %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VLDQ-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLDQ:       # %bb.0:
; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm2
; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLDQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT:    retq
;
; AVX512VLBW-LABEL: var_shuffle_v32i8_from_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpcmpgtb {{.*}}(%rip), %ymm1, %k1
; AVX512VLBW-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 {%k1}
; AVX512VLBW-NEXT:    retq
;
; VLVBMI-LABEL: var_shuffle_v32i8_from_v16i8:
; VLVBMI:       # %bb.0:
; VLVBMI-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
; VLVBMI-NEXT:    retq
  %index0 = extractelement <32 x i8> %indices, i32 0
  %index1 = extractelement <32 x i8> %indices, i32 1
  %index2 = extractelement <32 x i8> %indices, i32 2
  %index3 = extractelement <32 x i8> %indices, i32 3
  %index4 = extractelement <32 x i8> %indices, i32 4
  %index5 = extractelement <32 x i8> %indices, i32 5
  %index6 = extractelement <32 x i8> %indices, i32 6
  %index7 = extractelement <32 x i8> %indices, i32 7
  %index8 = extractelement <32 x i8> %indices, i32 8
  %index9 = extractelement <32 x i8> %indices, i32 9
  %index10 = extractelement <32 x i8> %indices, i32 10
  %index11 = extractelement <32 x i8> %indices, i32 11
  %index12 = extractelement <32 x i8> %indices, i32 12
  %index13 = extractelement <32 x i8> %indices, i32 13
  %index14 = extractelement <32 x i8> %indices, i32 14
  %index15 = extractelement <32 x i8> %indices, i32 15
  %index16 = extractelement <32 x i8> %indices, i32 16
  %index17 = extractelement <32 x i8> %indices, i32 17
  %index18 = extractelement <32 x i8> %indices, i32 18
  %index19 = extractelement <32 x i8> %indices, i32 19
  %index20 = extractelement <32 x i8> %indices, i32 20
  %index21 = extractelement <32 x i8> %indices, i32 21
  %index22 = extractelement <32 x i8> %indices, i32 22
  %index23 = extractelement <32 x i8> %indices, i32 23
  %index24 = extractelement <32 x i8> %indices, i32 24
  %index25 = extractelement <32 x i8> %indices, i32 25
  %index26 = extractelement <32 x i8> %indices, i32 26
  %index27 = extractelement <32 x i8> %indices, i32 27
  %index28 = extractelement <32 x i8> %indices, i32 28
  %index29 = extractelement <32 x i8> %indices, i32 29
  %index30 = extractelement <32 x i8> %indices, i32 30
  %index31 = extractelement <32 x i8> %indices, i32 31
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %v16 = extractelement <16 x i8> %v, i8 %index16
  %v17 = extractelement <16 x i8> %v, i8 %index17
  %v18 = extractelement <16 x i8> %v, i8 %index18
  %v19 = extractelement <16 x i8> %v, i8 %index19
  %v20 = extractelement <16 x i8> %v, i8 %index20
  %v21 = extractelement <16 x i8> %v, i8 %index21
  %v22 = extractelement <16 x i8> %v, i8 %index22
  %v23 = extractelement <16 x i8> %v, i8 %index23
  %v24 = extractelement <16 x i8> %v, i8 %index24
  %v25 = extractelement <16 x i8> %v, i8 %index25
  %v26 = extractelement <16 x i8> %v, i8 %index26
  %v27 = extractelement <16 x i8> %v, i8 %index27
  %v28 = extractelement <16 x i8> %v, i8 %index28
  %v29 = extractelement <16 x i8> %v, i8 %index29
  %v30 = extractelement <16 x i8> %v, i8 %index30
  %v31 = extractelement <16 x i8> %v, i8 %index31
  %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
  %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
  %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
  %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
  %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
  %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
  %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
  %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
  %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
  %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
  %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
  %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
  %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
  %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
  %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
  %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
  %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
  ret <32 x i8> %ret31
}

define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %indices) nounwind {
; XOP-LABEL: var_shuffle_v4f64_from_v2f64:
; XOP:       # %bb.0:
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm2
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm3
; AVX1-NEXT:    vpermilpd %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtq {{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vpermilpd %ymm3, %ymm0, %ymm2
; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm3
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v4f64_from_v2f64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT:    vpermpd %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <4 x i64> %indices, i32 0
  %index1 = extractelement <4 x i64> %indices, i32 1
  %index2 = extractelement <4 x i64> %indices, i32 2
  %index3 = extractelement <4 x i64> %indices, i32 3
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %v2 = extractelement <2 x double> %v, i64 %index2
  %v3 = extractelement <2 x double> %v, i64 %index3
  %ret0 = insertelement <4 x double> undef, double %v0, i32 0
  %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
  %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
  %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
  ret <4 x double> %ret3
}

define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v8f32_from_v4f32:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; XOP-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; XOP-NEXT:    vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
; AVX1-NEXT:    vpermilps %ymm1, %ymm2, %ymm2
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpcmpgtd {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v8f32_from_v4f32:
; INT256:       # %bb.0: # %entry
; INT256-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    retq
entry:
  %tmp1 = extractelement <8 x i32> %indices, i32 0
  %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1
  %tmp2 = extractelement <8 x i32> %indices, i32 1
  %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2
  %tmp3 = extractelement <8 x i32> %indices, i32 2
  %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3
  %tmp4 = extractelement <8 x i32> %indices, i32 3
  %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4
  %tmp5 = extractelement <8 x i32> %indices, i32 4
  %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5
  %tmp6 = extractelement <8 x i32> %indices, i32 5
  %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6
  %tmp7 = extractelement <8 x i32> %indices, i32 6
  %vecext2.14 = extractelement <4 x float> %v, i32 %tmp7
  %tmp8 = extractelement <8 x i32> %indices, i32 7
  %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8
  %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0
  %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1
  %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2
  %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3
  %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4
  %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5
  %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6
  %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7
  ret <8 x float> %tmp16
}

define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
; XOP-LABEL: var_shuffle_v4i32_from_v8i32:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT:    vpermil2ps $0, %xmm1, %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_v4i32_from_v8i32:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpermilps %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpgtd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; INT256-LABEL: var_shuffle_v4i32_from_v8i32:
; INT256:       # %bb.0: # %entry
; INT256-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; INT256-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; INT256-NEXT:    vzeroupper
; INT256-NEXT:    retq
entry:
  %tmp1 = extractelement <4 x i32> %indices, i32 0
  %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1
  %tmp2 = extractelement <4 x i32> %indices, i32 1
  %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2
  %tmp3 = extractelement <4 x i32> %indices, i32 2
  %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3
  %tmp4 = extractelement <4 x i32> %indices, i32 3
  %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4
  %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0
  %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1
  %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2
  %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3
  ret <4 x i32> %tmp12
}