; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512F
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512BW

; Both RUN lines share the ALL prefix; AVX512F / AVX512BW prefixes are used
; only by the functions whose expected output differs between the two runs.

target triple = "x86_64-unknown-unknown"

; Splat of element 0 of %a: the checks expect a single vbroadcastss.
define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <16 x float> %shuffle
}

; Splat of element 8 of %a: the checks expect a 128-bit extract followed by vbroadcastss.
define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x float> %shuffle
}

; Same element-8 splat, but the inputs arrive as <16 x i32> and are bitcast to
; float first; the expected code is the same extract + vbroadcastss pair.
define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
; ALL-NEXT:    retq
  %tmp0 = bitcast <16 x i32> %a to <16 x float>
  %tmp1 = bitcast <16 x i32> %b to <16 x float>
  %shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x float> %shuffle
}

; Interleaves the low two elements of every 128-bit lane of %a and %b: expect vunpcklps.
define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  ret <16 x float> %shuffle
}

; Low-element interleave of %a with a zero vector: expect vxorps to build the
; zero operand, then vunpcklps.
define <16 x float> @shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32><i32 0, i32 16, i32 1, i32 16, i32 4, i32 16, i32 5, i32 16, i32 8, i32 16, i32 9, i32 16, i32 12, i32 16, i32 13, i32 16>
  ret <16 x float> %shuffle
}

; Low-element interleave with %b's element first in each pair: expect vunpcklps
; with the source operands swapped.
define <16 x float> @shuffle_v16f32_vunpcklps_swap(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_vunpcklps_swap:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 20, i32 4, i32 21, i32 5, i32 24, i32 8, i32 25, i32 9, i32 28, i32 12, i32 29, i32 13>
  ret <16 x float> %shuffle
}

; PR34382
; Single-input shuffle whose indices stay inside each 128-bit lane but differ
; from lane to lane: expect one vpermilps.
define <16 x float> @shuffle_v16f32_01_01_03_00_06_04_05_07_08_08_09_09_15_14_14_12(<16 x float> %a0) {
; ALL-LABEL: shuffle_v16f32_01_01_03_00_06_04_05_07_08_08_09_09_15_14_14_12:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,1,3,0,6,4,5,7,8,8,9,9,15,14,14,12]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 0, i32 6, i32 4, i32 5, i32 7, i32 8, i32 8, i32 9, i32 9, i32 15, i32 14, i32 14, i32 12>
  ret <16 x float> %shuffle
}

; Integer version of the per-lane low-element interleave: the checks still expect vunpcklps.
define <16 x i32> @shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  ret <16 x i32> %shuffle
}

; Zero vector as the first operand (every index below 16 selects a zero),
; interleaved with %b's low per-lane elements: expect vxorps + vunpcklps.
define <16 x i32> @shuffle_v16i32_zz_10_zz_11_zz_14_zz_15_zz_18_zz_19_zz_1c_zz_1d(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_zz_10_zz_11_zz_14_zz_15_zz_18_zz_19_zz_1c_zz_1d:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> zeroinitializer, <16 x i32> %b, <16 x i32><i32 15, i32 16, i32 13, i32 17, i32 11, i32 20, i32 9, i32 21, i32 7, i32 24, i32 5, i32 25, i32 3, i32 28, i32 1, i32 29>
  ret <16 x i32> %shuffle
}

; Interleaves the high two elements of every 128-bit lane of %a and %b: expect vunpckhps.
define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x float> %shuffle
}

; Zero vector as the first operand interleaved with %b's high per-lane
; elements: expect vxorps + vunpckhps.
define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> zeroinitializer, <16 x float> %b, <16 x i32><i32 0, i32 18, i32 0, i32 19, i32 4, i32 22, i32 4, i32 23, i32 6, i32 26, i32 6, i32 27, i32 8, i32 30, i32 8, i32 31>
  ret <16 x float> %shuffle
}

; Duplicates every even-indexed element into the following odd slot: expect vmovsldup.
define <16 x float> @shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %shuffle
}

; Duplicates every odd-indexed element into the preceding even slot: expect vmovshdup.
define <16 x float> @shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %shuffle
}

; The mask moves adjacent float pairs as units, so the checks expect it to be
; matched as a 64-bit element shuffle (vpermilpd).
define <16 x float> @shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilpd {{.*#+}} zmm0 = zmm0[0,0,3,3,4,5,6,6]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 12, i32 13>
  ret <16 x float> %shuffle
}

; The same in-lane pattern (0,0,2,0) repeated in every 128-bit lane: expect one vpermilps.
define <16 x float> @shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4, i32 8, i32 8, i32 10, i32 8, i32 12, i32 12, i32 14, i32 12>
  ret <16 x float> %shuffle
}

; Mostly-undef mask with one defined element per lane: the checks expect the
; undefs to be filled so that a single repeating in-lane vpermilps suffices.
define <16 x float> @shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12>
  ret <16 x float> %shuffle
}

; PR41203
; Two chained shuffles: the second takes its even elements from a constant
; whose even elements are 0.0 and its odd elements from %tmp1. The checks
; expect the pair to collapse into one vandps with a RIP-relative constant.
define <16 x float> @shuffle_v16f32_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
; ALL:       # %bb.0:
; ALL-NEXT:    vandps {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %tmp1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 undef, i32 17, i32 undef, i32 19, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 9, i32 undef, i32 11, i32 undef, i32 13, i32 undef, i32 15>
  %tmp2 = shufflevector <16 x float> %tmp1, <16 x float> <float 0.000000e+00, float undef, float 0.000000e+00, float undef, float 0.000000e+00, float undef, float 0.000000e+00, float undef, float 0.000000e+00, float undef, float 0.000000e+00, float undef, float 0.000000e+00, float undef, float 0.000000e+00, float undef>, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
  ret <16 x float> %tmp2
}

; PR48322
; Each 128-bit lane takes the high float pair of %a then the low float pair of
; %b: expect a single 64-bit-element vshufpd.
define <16 x float> @shuffle_v16f32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29:
; ALL:       # %bb.0:
; ALL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[7],zmm1[6]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 20, i32 21, i32 10, i32 11, i32 24, i32 25, i32 14, i32 15, i32 28, i32 29>
  ret <16 x float> %shuffle
}

; Integer splat of element 0: the checks expect the FP-domain vbroadcastss.
define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <16 x i32> %shuffle
}

; Integer splat of element 4: expect a 128-bit extract of the upper half of
; ymm0 followed by vbroadcastss.
define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <16 x i32> %shuffle
}

; Integer version of the per-lane high-element interleave: the checks still expect vunpckhps.
define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
; ALL:       # %bb.0:
; ALL-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i32> %shuffle
}

; High per-lane elements of %a interleaved with zeros (every index >= 16
; selects from zeroinitializer): expect vxorps + vunpckhps.
define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_zz(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> zeroinitializer, <16 x i32><i32 2, i32 30, i32 3, i32 28, i32 6, i32 26, i32 7, i32 24, i32 10, i32 22, i32 11, i32 20, i32 14, i32 18, i32 15, i32 16>
  ret <16 x i32> %shuffle
}

; Each 128-bit lane takes elements 1..3 of %a followed by element 0 of %b.
; The AVX512F run expects vpermt2d with a constant index vector; the AVX512BW
; run expects a single vpalignr.
define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28(<16 x i32> %a, <16 x i32> %b) {
; AVX512F-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28]
; AVX512F-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3],zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19],zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35],zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51]
; AVX512BW-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 16, i32 5, i32 6, i32 7, i32 20, i32 9, i32 10, i32 11, i32 24, i32 13, i32 14, i32 15, i32 28>
  ret <16 x i32> %shuffle
}

; Single-input, lane-crossing shuffle with some undef indices: expect a
; constant index vector (undefs kept as 'u') fed to vpermps.
define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a)  {
; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1>
; ALL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT:    retq
  %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
  ret <16 x float> %c
}

; Integer version of the single-input lane-crossing shuffle: the checks expect
; the same vmovaps + vpermps sequence as the float variant.
define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x i32> %a)  {
; ALL-LABEL: shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1>
; ALL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT:    retq
  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
  ret <16 x i32> %c
}

; Two-input, lane-crossing integer shuffle: expect a constant index vector fed to vpermt2d.
define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, <16 x i32> %b)  {
; ALL-LABEL: shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
; ALL-NEXT:    retq
  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
  ret <16 x i32> %c
}

; Two-input, lane-crossing float shuffle: expect a constant index vector fed to vpermt2ps.
define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float> %b)  {
; ALL-LABEL: shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
; ALL-NEXT:    retq
  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
  ret <16 x float> %c
}

; PR46249
; Reverses the elements inside each 128-bit lane and swaps the two 256-bit
; halves: expect an in-lane vpshufd followed by vshufi64x2.
define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; ALL:       # %bb.0:
; ALL-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; ALL-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
; ALL-NEXT:    retq
  %1 = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <16 x i32> %1
}

; Float version of the in-lane reverse plus 256-bit half swap: expect
; vpermilps followed by vshuff64x2.
define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; ALL-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
; ALL-NEXT:    retq
  %1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <16 x float> %1
}

; Two-input float shuffle whose second operand is loaded from %b: the checks
; expect the load to be folded into vpermt2ps as a memory operand.
define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float>* %b)  {
; ALL-LABEL: shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT:    vpermt2ps (%rdi), %zmm1, %zmm0
; ALL-NEXT:    retq
  %c = load <16 x float>, <16 x float>* %b
  %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
  ret <16 x float> %d
}

; In-lane two-input shuffle with the first operand loaded from %a1: the checks
; expect two vshufps, with the load folded into the first one.
define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, <16 x float>* %a1) {
; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
; ALL:       # %bb.0:
; ALL-NEXT:    vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
; ALL-NEXT:    retq
  %1 = load <16 x float>, <16 x float>* %a1
  %2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> <i32 16, i32 19, i32 18, i32 0, i32 20, i32 23, i32 22, i32 4, i32 24, i32 27, i32 26, i32 8, i32 28, i32 31, i32 30, i32 12>
  ret <16 x float> %2
}

; Two-input integer shuffle whose second operand is loaded from %b: the checks
; expect the load to be folded into vpermt2d as a memory operand.
define <16 x i32> @shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, <16 x i32>* %b)  {
; ALL-LABEL: shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT:    vpermt2d (%rdi), %zmm1, %zmm0
; ALL-NEXT:    retq
  %c = load <16 x i32>, <16 x i32>* %b
  %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
  ret <16 x i32> %d
}

; Only the low four result elements are defined, so the checks expect the
; shuffle to narrow to a 128-bit vblendps.
define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b)  {
; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
; ALL:       # %bb.0:
; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; ALL-NEXT:    retq
  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <16 x i32> %c
}

;FIXME: can do better with vpcompress
; Gathers the odd elements of a 512-bit vector into a 256-bit result; the
; current expectation is extract + vshufps + vpermpd.
define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; ALL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; ALL-NEXT:    retq
  %res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <8 x i32> %res
}

;FIXME: can do better with vpcompress
; Narrowing shuffle to <4 x i32> that keeps elements 0..2 and pulls in
; element 12; the current expectation is two extracts, a broadcast and a
; blend, followed by vzeroupper.
define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
; ALL-LABEL: test_v16i32_0_1_2_12:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT:    vextractf128 $1, %ymm1, %xmm1
; ALL-NEXT:    vbroadcastss %xmm1, %xmm1
; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
  ret <4 x i32> %res
}

;PR31451
;FIXME: can do better with vpcompress
; Narrowing shuffle that picks the first element of each 128-bit lane; the
; current expectation is a constant index vector fed to a full-width vpermps.
define <4 x i32> @test_v16i32_0_4_8_12(<16 x i32> %v) {
; ALL-LABEL: test_v16i32_0_4_8_12:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovaps {{.*#+}} xmm1 = [0,4,8,12]
; ALL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  ret <4 x i32> %res
}

; %RET is unused. Taking elements 8..15 of the vector loaded from %a is
; expected to fold into a single unaligned 256-bit load at byte offset 32.
define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
; ALL-LABEL: shuffle_v16f32_extract_256:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 32(%rsi), %ymm0
; ALL-NEXT:    retq
  %ptr_a = bitcast float* %a to <16 x float>*
  %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4
  %v2 = shufflevector <16 x float> %v_a, <16 x float> undef, <8 x i32>  <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %v2
}

;FIXME: can do better with vcompressp
; Narrowing float shuffle to <8 x float>; the current expectation is a
; broadcast-loaded index vector fed to a full-width vpermd.
define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,6,7,10,0,1,2,3,4,6,7,10]
; ALL-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; ALL-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; ALL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; ALL-NEXT:    retq
  %res = shufflevector <16 x float> %v, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 10>
  ret <8 x float> %res
}

;FIXME: can do better with vcompressp
; Narrowing float shuffle to <4 x float>; the current expectation is a
; broadcast-loaded index vector fed to a full-width vpermd, then vzeroupper.
define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_3_6:
; ALL:       # %bb.0:
; ALL-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,3,6,0,1,3,6,0,1,3,6,0,1,3,6]
; ALL-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; ALL-NEXT:    vpermd %zmm0, %zmm1, %zmm0
; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %res = shufflevector <16 x float> %v, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 6>
  ret <4 x float> %res
}

; In-lane pattern (1,0,0,0) repeated in every 128-bit lane: expect one vpermilps.
; NOTE(review): the name says v16i16 but the vectors are <16 x i32>; the name
; is kept because the CHECK-LABEL lines depend on it.
define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b)  {
; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; ALL-NEXT:    retq
  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  ret <16 x i32> %c
}

; Swaps the two 64-bit halves of every 128-bit lane: expect one vpermilps.
; NOTE(review): the name spells mask 3,3,0,0,... on v16i16, but the IR below
; uses mask 2,3,0,1,... on <16 x i32>; the name is kept because the
; CHECK-LABEL lines depend on it.
define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b)  {
; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12:
; ALL:       # %bb.0:
; ALL-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; ALL-NEXT:    retq
  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
  ret <16 x i32> %c
}

; Each lane keeps %a's low pair and duplicates %b's first element into the
; high pair: expect a single vshufps.
define <16 x float> @shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c:
; ALL:       # %bb.0:
; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,1],zmm1[0,0],zmm0[4,5],zmm1[4,4],zmm0[8,9],zmm1[8,8],zmm0[12,13],zmm1[12,12]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
  ret <16 x float> %shuffle
}

; A scalar loaded from %ptr is placed in element 0 and every other element is
; taken from a zero vector: expect a single vmovss from memory.
define <16 x i32> @insert_mem_and_zero_v16i32(i32* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    retq
  %a = load i32, i32* %ptr
  %v = insertelement <16 x i32> undef, i32 %a, i32 0
  %shuffle = shufflevector <16 x i32> %v, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i32> %shuffle
}


; Keeps element 0 of %a and zeroes everything else: expect vxorps plus a
; 128-bit vblendps.
define <16 x i32> @shuffle_v16i32_0zzzzzzzzzzzzzzz(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_0zzzzzzzzzzzzzzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i32> %shuffle
}

; Float version of "keep element 0, zero the rest": expect vxorps plus a
; 128-bit vblendps.
define <16 x float> @shuffle_v16f32_0zzzzzzzzzzzzzzz(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_0zzzzzzzzzzzzzzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x float> %shuffle
}

; Low eight elements of %a, each followed by a zero: expect a dword-to-qword
; zero extension (vpmovzxdq).
define <16 x i32> @shuffle_v16i32_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_zz(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_zz:
; ALL:       # %bb.0:
; ALL-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> zeroinitializer, <16 x i32> %a, <16 x i32> <i32 16, i32 0, i32 17, i32 0, i32 18, i32 0, i32 19, i32 0, i32 20, i32 0, i32 21, i32 0, i32 22, i32 0, i32 23, i32 0>
  ret <16 x i32> %shuffle
}

; Elements 1..15 of %a followed by element 0 of %b: expect a two-input valignd.
define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
; ALL:       # %bb.0:
; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
  ret <16 x i32> %shuffle
}

; Rotation of %a by one element: expect a single-input valignd.
define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
; ALL:       # %bb.0:
; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
  ret <16 x i32> %shuffle
}

; Each lane takes elements 0 and 3 of %a, then elements 0 and 3 of %b: expect a single vshufps.
define <16 x i32> @shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_31(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_31:
; ALL:       # %bb.0:
; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,3],zmm0[4,7],zmm1[4,7],zmm0[8,11],zmm1[8,11],zmm0[12,15],zmm1[12,15]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 3, i32 16, i32 19, i32 4, i32 7, i32 20, i32 23, i32 8, i32 11, i32 24, i32 27, i32 12, i32 15, i32 28, i32 31>
  ret <16 x i32> %shuffle
}

; Each lane duplicates %b's first element into the low pair and keeps %a's
; high pair (last two indices undef): expect vshufps with %b as the first source.
define <16 x i32> @shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_uu(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_uu:
; ALL:       # %bb.0:
; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm1[0,0],zmm0[2,3],zmm1[4,4],zmm0[6,7],zmm1[8,8],zmm0[10,11],zmm1[12,12],zmm0[14,15]
; ALL-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 16, i32 16, i32 02, i32 03, i32 20, i32 20, i32 06, i32 07, i32 24, i32 24, i32 10, i32 11, i32 28, i32 28, i32 undef, i32 undef>
  ret <16 x i32> %shuffle
}

; PR48322
; Integer version: each lane takes the high dword pair of %a then the low
; dword pair of %b. The AVX512F run expects vpermt2q with a constant index
; vector; the AVX512BW run expects a single vpalignr.
define <16 x i32> @shuffle_v16i32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29(<16 x i32> %a, <16 x i32> %b) {
; AVX512F-LABEL: shuffle_v16i32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,3,10,5,12,7,14]
; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i32_02_03_16_17_06_07_20_21_10_11_24_25_14_15_28_29:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm0 = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7],zmm0[24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23],zmm0[40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39],zmm0[56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55]
; AVX512BW-NEXT:    retq
  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 20, i32 21, i32 10, i32 11, i32 24, i32 25, i32 14, i32 15, i32 28, i32 29>
  ret <16 x i32> %shuffle
}

; Two-input shuffle: within every group of four i32 elements the result takes
; elements 1 and 0 of %b followed by elements 1 and 0 of %a; expected to lower
; to a single vshufps.
; NOTE(review): the function name says "v8i32" but the operands and result are
; <16 x i32>; the name is kept as-is because the CHECK labels depend on it.
534define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12(<16 x i32> %a, <16 x i32> %b) {
535; ALL-LABEL: shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12:
536; ALL:       # %bb.0:
537; ALL-NEXT:    vshufps {{.*#+}} zmm0 = zmm1[1,0],zmm0[1,0],zmm1[5,4],zmm0[5,4],zmm1[9,8],zmm0[9,8],zmm1[13,12],zmm0[13,12]
538; ALL-NEXT:    retq
539  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 17, i32 16, i32 01, i32 00, i32 21, i32 20, i32 05, i32 04, i32 25, i32 24, i32 09, i32 08, i32 29, i32 28, i32 13, i32 12>
540  ret <16 x i32> %shuffle
541}
542
; Widening splat: element 4 of the <8 x float> input is replicated into all 16
; result elements. The expected code extracts the upper 128 bits of ymm0 and
; broadcasts its first float with vbroadcastss.
543define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
544; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
545; ALL:       # %bb.0:
546; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
547; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
548; ALL-NEXT:    retq
549  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
550  ret <16 x float> %shuffle
551}
552
; Merge-masked single-input rotation: %a is rotated down by two i32 elements
; (elements 2..15 followed by 0..1), then each result element is chosen between
; the shuffle and %passthru according to the bits of the i16 %mask.
; Expected lowering: a merge-masked valignd written into the %passthru register
; (zmm1), then a move into zmm0. The two configurations differ only in how the
; mask reaches %k1 (kmovw for AVX512F, kmovd for AVX512BW).
553define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, <16 x i32> %passthru, i16 %mask) {
554; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
555; AVX512F:       # %bb.0:
556; AVX512F-NEXT:    kmovw %edi, %k1
557; AVX512F-NEXT:    valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
558; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
559; AVX512F-NEXT:    retq
560;
561; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
562; AVX512BW:       # %bb.0:
563; AVX512BW-NEXT:    kmovd %edi, %k1
564; AVX512BW-NEXT:    valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
565; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
566; AVX512BW-NEXT:    retq
567  %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
568  %mask.cast = bitcast i16 %mask to <16 x i1>
569  %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
570  ret <16 x i32> %res
571}
572
; Merge-masked two-input alignment: the shuffle yields elements 2..15 of %a
; followed by elements 0..1 of %b, and the i16 %mask selects per element
; between that result and %passthru.
; Expected lowering: a merge-masked valignd written into the %passthru register
; (zmm2), then a move into zmm0; kmovw (AVX512F) vs kmovd (AVX512BW) is the
; only difference between the two configurations.
573define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
574; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
575; AVX512F:       # %bb.0:
576; AVX512F-NEXT:    kmovw %edi, %k1
577; AVX512F-NEXT:    valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
578; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
579; AVX512F-NEXT:    retq
580;
581; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
582; AVX512BW:       # %bb.0:
583; AVX512BW-NEXT:    kmovd %edi, %k1
584; AVX512BW-NEXT:    valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
585; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
586; AVX512BW-NEXT:    retq
587  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
588  %mask.cast = bitcast i16 %mask to <16 x i1>
589  %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
590  ret <16 x i32> %res
591}
592
; Zero-masked single-input rotation: %a is rotated down by two i32 elements and
; every element whose %mask bit is clear is replaced by zero (the select's
; false operand is zeroinitializer).
; Expected lowering: one zero-masked ({z}) valignd in place on zmm0, with the
; mask moved to %k1 by kmovw (AVX512F) or kmovd (AVX512BW).
593define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, i16 %mask) {
594; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
595; AVX512F:       # %bb.0:
596; AVX512F-NEXT:    kmovw %edi, %k1
597; AVX512F-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
598; AVX512F-NEXT:    retq
599;
600; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
601; AVX512BW:       # %bb.0:
602; AVX512BW-NEXT:    kmovd %edi, %k1
603; AVX512BW-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
604; AVX512BW-NEXT:    retq
605  %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
606  %mask.cast = bitcast i16 %mask to <16 x i1>
607  %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> zeroinitializer
608  ret <16 x i32> %res
609}
610
; Zero-masked two-input alignment: the shuffle yields elements 2..15 of %a
; followed by elements 0..1 of %b, and every element whose %mask bit is clear
; is replaced by zero.
; Expected lowering: one zero-masked ({z}) valignd into zmm0, with the mask
; moved to %k1 by kmovw (AVX512F) or kmovd (AVX512BW).
611define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
612; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
613; AVX512F:       # %bb.0:
614; AVX512F-NEXT:    kmovw %edi, %k1
615; AVX512F-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
616; AVX512F-NEXT:    retq
617;
618; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
619; AVX512BW:       # %bb.0:
620; AVX512BW-NEXT:    kmovd %edi, %k1
621; AVX512BW-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
622; AVX512BW-NEXT:    retq
623  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
624  %mask.cast = bitcast i16 %mask to <16 x i1>
625  %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> zeroinitializer
626  ret <16 x i32> %res
627}
628
; Unmasked 128-bit-granular float shuffle: the result is elements 0..7 of %x,
; then elements 4..7 of %x1, then elements 0..3 of %x1.
; With no mask involved the expected instruction is the 64-bit-element form
; vshuff64x2, so the CHECK indices are expressed in 64-bit elements.
629define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) nounwind {
630; ALL-LABEL: test_vshuff32x4_512:
631; ALL:       # %bb.0:
632; ALL-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
633; ALL-NEXT:    retq
634  %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
635  ret <16 x float> %res
636}
637
; Unmasked 128-bit-granular integer shuffle: the result is elements 0..7 of %x,
; then elements 4..7 of %x1, then elements 0..3 of %x1.
; With no mask involved the expected instruction is the 64-bit-element form
; vshufi64x2, so the CHECK indices are expressed in 64-bit elements.
638define <16 x i32> @test_vshufi32x4_512(<16 x i32> %x, <16 x i32> %x1) nounwind {
639; ALL-LABEL: test_vshufi32x4_512:
640; ALL:       # %bb.0:
641; ALL-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
642; ALL-NEXT:    retq
643  %res = shufflevector <16 x i32> %x, <16 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
644  ret <16 x i32> %res
645}
646
; Merge-masked 128-bit-granular float shuffle: elements 0..7 of %x, then 4..7
; and 0..3 of %x1, selected per element against %y by the <16 x i1> %mask.
; Because the mask has 32-bit granularity the expected instruction is
; vshuff32x4 with {%k1}, written into the %y register (zmm2) and then moved to
; zmm0. The mask arrives as a vector in xmm3 and is converted to %k1 first:
; AVX512F sign-extends bytes to dwords, shifts and uses vpmovd2m; AVX512BW
; shifts the bytes and uses vpmovb2m.
647define <16 x float> @test_vshuff32x4_512_mask(<16 x float> %x, <16 x float> %x1, <16 x float> %y, <16 x i1> %mask) nounwind {
648; AVX512F-LABEL: test_vshuff32x4_512_mask:
649; AVX512F:       # %bb.0:
650; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
651; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
652; AVX512F-NEXT:    vpmovd2m %zmm3, %k1
653; AVX512F-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
654; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
655; AVX512F-NEXT:    retq
656;
657; AVX512BW-LABEL: test_vshuff32x4_512_mask:
658; AVX512BW:       # %bb.0:
659; AVX512BW-NEXT:    vpsllw $7, %xmm3, %xmm3
660; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
661; AVX512BW-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
662; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
663; AVX512BW-NEXT:    retq
664  %x2 = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
665  %res = select <16 x i1> %mask, <16 x float> %x2, <16 x float> %y
666  ret <16 x float> %res
667}
668
; Merge-masked 128-bit-granular integer shuffle: elements 0..7 of %x, then 4..7
; and 0..3 of %x1, selected per element against %y by the <16 x i1> %mask.
; The expected instruction is vshufi32x4 with {%k1}, written into the %y
; register (zmm2) and then moved to zmm0. The vector mask in xmm3 is converted
; to %k1 first: vpmovsxbd + vpslld + vpmovd2m on AVX512F, vpsllw + vpmovb2m on
; AVX512BW.
669define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x i32> %y, <16 x i1> %mask) nounwind {
670; AVX512F-LABEL: test_vshufi32x4_512_mask:
671; AVX512F:       # %bb.0:
672; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
673; AVX512F-NEXT:    vpslld $31, %zmm3, %zmm3
674; AVX512F-NEXT:    vpmovd2m %zmm3, %k1
675; AVX512F-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
676; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
677; AVX512F-NEXT:    retq
678;
679; AVX512BW-LABEL: test_vshufi32x4_512_mask:
680; AVX512BW:       # %bb.0:
681; AVX512BW-NEXT:    vpsllw $7, %xmm3, %xmm3
682; AVX512BW-NEXT:    vpmovb2m %zmm3, %k1
683; AVX512BW-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
684; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
685; AVX512BW-NEXT:    retq
686  %x2 = shufflevector <16 x i32> %x, <16 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
687  %res = select <16 x i1> %mask, <16 x i32> %x2, <16 x i32> %y
688  ret <16 x i32> %res
689}
690
; Merge-masked concatenation of the low halves: elements 0..7 of %a followed by
; elements 0..7 of %b, selected per element against %passthru by the i16 %mask.
; Expected lowering: a masked vinsertf32x8 placing ymm1 in the upper 256 bits,
; written into the %passthru register (zmm2) and then moved to zmm0.
691define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
692; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
693; AVX512F:       # %bb.0:
694; AVX512F-NEXT:    kmovw %edi, %k1
695; AVX512F-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
696; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
697; AVX512F-NEXT:    retq
698;
699; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
700; AVX512BW:       # %bb.0:
701; AVX512BW-NEXT:    kmovd %edi, %k1
702; AVX512BW-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
703; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
704; AVX512BW-NEXT:    retq
705  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
706  %mask.cast = bitcast i16 %mask to <16 x i1>
707  %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru
708  ret <16 x float> %res
709}
710
; Merge-masked 128-bit insertion: the shuffle is %a with its elements 4..7
; replaced by elements 0..3 of %b, selected per element against %passthru by
; the i16 %mask.
; Expected lowering: a masked vinsertf32x4 at position 1, written into the
; %passthru register (zmm2) and then moved to zmm0.
711define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
712; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
713; AVX512F:       # %bb.0:
714; AVX512F-NEXT:    kmovw %edi, %k1
715; AVX512F-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
716; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
717; AVX512F-NEXT:    retq
718;
719; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
720; AVX512BW:       # %bb.0:
721; AVX512BW-NEXT:    kmovd %edi, %k1
722; AVX512BW-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
723; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
724; AVX512BW-NEXT:    retq
725  %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
726  %mask.cast = bitcast i16 %mask to <16 x i1>
727  %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru
728  ret <16 x float> %res
729}
730
; Integer counterpart of the low-half concatenation: elements 0..7 of %a
; followed by elements 0..7 of %b, selected per element against %passthru by
; the i16 %mask.
; Expected lowering: a masked vinserti32x8 placing ymm1 in the upper 256 bits,
; written into the %passthru register (zmm2) and then moved to zmm0.
731define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
732; AVX512F-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
733; AVX512F:       # %bb.0:
734; AVX512F-NEXT:    kmovw %edi, %k1
735; AVX512F-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
736; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
737; AVX512F-NEXT:    retq
738;
739; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
740; AVX512BW:       # %bb.0:
741; AVX512BW-NEXT:    kmovd %edi, %k1
742; AVX512BW-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
743; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
744; AVX512BW-NEXT:    retq
745  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
746  %mask.cast = bitcast i16 %mask to <16 x i1>
747  %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
748  ret <16 x i32> %res
749}
750
; Integer counterpart of the 128-bit insertion: the shuffle is %a with its
; elements 4..7 replaced by elements 0..3 of %b, selected per element against
; %passthru by the i16 %mask.
; Expected lowering: a masked vinserti32x4 at position 1, written into the
; %passthru register (zmm2) and then moved to zmm0.
751define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
752; AVX512F-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
753; AVX512F:       # %bb.0:
754; AVX512F-NEXT:    kmovw %edi, %k1
755; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
756; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
757; AVX512F-NEXT:    retq
758;
759; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
760; AVX512BW:       # %bb.0:
761; AVX512BW-NEXT:    kmovd %edi, %k1
762; AVX512BW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
763; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
764; AVX512BW-NEXT:    retq
765  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
766  %mask.cast = bitcast i16 %mask to <16 x i1>
767  %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
768  ret <16 x i32> %res
769}
770
; Widening repeat: the four elements of the <4 x i32> input are replicated four
; times to fill a <16 x i32> result. Expected lowering duplicates xmm0 into
; both halves of ymm0 (vinsertf128), then ymm0 into both halves of zmm0
; (vinsertf64x4).
; NOTE(review): despite the "mask_" prefix this function takes no mask operand
; and performs no select; the name is kept because the CHECK labels use it.
771define <16 x i32> @mask_shuffle_v4i32_v16i32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03(<4 x i32> %a) {
772; ALL-LABEL: mask_shuffle_v4i32_v16i32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03:
773; ALL:       # %bb.0:
774; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
775; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
776; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
777; ALL-NEXT:    retq
778  %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
779  ret <16 x i32> %res
780}
781
; Widening repeat: the four elements of the <4 x float> input are replicated
; four times to fill a <16 x float> result. Expected lowering duplicates xmm0
; into both halves of ymm0 (vinsertf128), then ymm0 into both halves of zmm0
; (vinsertf64x4).
; NOTE(review): despite the "mask_" prefix this function takes no mask operand
; and performs no select; the name is kept because the CHECK labels use it.
782define <16 x float> @mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03(<4 x float> %a) {
783; ALL-LABEL: mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03:
784; ALL:       # %bb.0:
785; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
786; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
787; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
788; ALL-NEXT:    retq
789  %res = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
790  ret <16 x float> %res
791}
792
; Aggregate used as the alloca element type in @ispc_1864: a [4 x double]
; array, a [3 x [4 x double]] array and another [4 x double] array (20 doubles
; in total).
793%struct.foo = type { [4 x double], [3 x [4 x double]], [4 x double] }
794
795; This test previously hung in shuffle combining. https://github.com/ispc/ispc/issues/1864
; Loads a <16 x float> from %arg, multiplies every element by -5.0, extends the
; result to <16 x double>, and stores elements 10..13 of it into the four
; doubles of field 2 of element 3 of a local [30 x %struct.foo] array.
; The expected code touches only the upper eight floats (32(%rdi)), converts
; them to doubles, rearranges them with a single vshuff64x2 and writes all four
; doubles with one 256-bit vmovapd to the stack.
796define void @ispc_1864(<16 x float>* %arg) {
797; ALL-LABEL: ispc_1864:
798; ALL:       # %bb.0: # %bb
799; ALL-NEXT:    pushq %rbp
800; ALL-NEXT:    .cfi_def_cfa_offset 16
801; ALL-NEXT:    .cfi_offset %rbp, -16
802; ALL-NEXT:    movq %rsp, %rbp
803; ALL-NEXT:    .cfi_def_cfa_register %rbp
804; ALL-NEXT:    andq $-64, %rsp
805; ALL-NEXT:    subq $4864, %rsp # imm = 0x1300
806; ALL-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0]
807; ALL-NEXT:    vmulps 32(%rdi), %ymm0, %ymm0
808; ALL-NEXT:    vcvtps2pd %ymm0, %zmm0
809; ALL-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,0,1,0,1]
810; ALL-NEXT:    vmovapd %ymm0, {{[0-9]+}}(%rsp)
811; ALL-NEXT:    movq %rbp, %rsp
812; ALL-NEXT:    popq %rbp
813; ALL-NEXT:    .cfi_def_cfa %rsp, 8
814; ALL-NEXT:    vzeroupper
815; ALL-NEXT:    retq
816bb:
817  %tmp = alloca [30 x %struct.foo], align 64
818  %tmp1 = load <16 x float>, <16 x float>* %arg, align 4
819  %tmp2 = fmul <16 x float> %tmp1, <float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00>
820  %tmp3 = fpext <16 x float> %tmp2 to <16 x double>
821  %tmp4 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 0
822  %tmp5 = extractelement <16 x double> %tmp3, i32 10
823  store double %tmp5, double* %tmp4, align 32
824  %tmp6 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 1
825  %tmp7 = extractelement <16 x double> %tmp3, i32 11
826  store double %tmp7, double* %tmp6, align 8
827  %tmp8 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 2
828  %tmp9 = extractelement <16 x double> %tmp3, i32 12
829  store double %tmp9, double* %tmp8, align 16
830  %tmp10 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 3
831  %tmp11 = extractelement <16 x double> %tmp3, i32 13
832  store double %tmp11, double* %tmp10, align 8
833  ret void
834}
835
836