• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ
5
; Swaps the two lanes of a <2 x i1> argument: a shufflevector with mask <1, 0>
; and an undef second operand, whose result is returned directly.
; In all three configurations the pinned code performs the lane swap with
; vpshufd on a vector whose 64-bit elements were expanded from the mask bits.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
6define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
7; AVX512F-LABEL: shuf2i1_1_0:
8; AVX512F:       # %bb.0:
9; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
10; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
11; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
12; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
13; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
14; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
15; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
16; AVX512F-NEXT:    vzeroupper
17; AVX512F-NEXT:    retq
18;
19; AVX512VL-LABEL: shuf2i1_1_0:
20; AVX512VL:       # %bb.0:
21; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
22; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
23; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
24; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
25; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
26; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
27; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
28; AVX512VL-NEXT:    retq
29;
30; VL_BW_DQ-LABEL: shuf2i1_1_0:
31; VL_BW_DQ:       # %bb.0:
32; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
33; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
34; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
35; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
36; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
37; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
38; VL_BW_DQ-NEXT:    retq
39  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
40  ret <2 x i1> %b
41}
42
; Two-operand <2 x i1> shuffle against a constant: mask <1, 2> takes lane 1 of
; %a for result lane 0 and lane 0 of the constant <i1 1, i1 0> (index 2 counts
; into the second operand) for result lane 1.
; The pinned sequences materialise the constant with "movq $-1, %rax" followed
; by vmovq, and combine it with the expanded argument using vpalignr.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
43define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
44; AVX512F-LABEL: shuf2i1_1_2:
45; AVX512F:       # %bb.0:
46; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
47; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
48; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
49; AVX512F-NEXT:    movq $-1, %rax
50; AVX512F-NEXT:    vmovq %rax, %xmm1
51; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
52; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
53; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
54; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
55; AVX512F-NEXT:    vzeroupper
56; AVX512F-NEXT:    retq
57;
58; AVX512VL-LABEL: shuf2i1_1_2:
59; AVX512VL:       # %bb.0:
60; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
61; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
62; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
63; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
64; AVX512VL-NEXT:    movq $-1, %rax
65; AVX512VL-NEXT:    vmovq %rax, %xmm2
66; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
67; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
68; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
69; AVX512VL-NEXT:    retq
70;
71; VL_BW_DQ-LABEL: shuf2i1_1_2:
72; VL_BW_DQ:       # %bb.0:
73; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
74; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
75; VL_BW_DQ-NEXT:    movq $-1, %rax
76; VL_BW_DQ-NEXT:    vmovq %rax, %xmm0
77; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
78; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
79; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
80; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
81; VL_BW_DQ-NEXT:    retq
82  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
83  ret <2 x i1> %b
84}
85
86
; Reverses the four lanes of a <4 x i1> argument: a shufflevector with mask
; <3, 2, 1, 0> and an undef second operand.
; The function name reads "3_2_10", but the mask it encodes is 3, 2, 1, 0. The
; name is referenced by the label checks below, so it is left as is.
; All three pinned sequences perform the reversal with vpshufd [3,2,1,0] on
; 32-bit elements expanded from the mask bits.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
87define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
88; AVX512F-LABEL: shuf4i1_3_2_10:
89; AVX512F:       # %bb.0:
90; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
91; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
92; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
93; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
94; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
95; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
96; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
97; AVX512F-NEXT:    vzeroupper
98; AVX512F-NEXT:    retq
99;
100; AVX512VL-LABEL: shuf4i1_3_2_10:
101; AVX512VL:       # %bb.0:
102; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
103; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
104; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
105; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} {z}
106; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
107; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
108; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
109; AVX512VL-NEXT:    retq
110;
111; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
112; VL_BW_DQ:       # %bb.0:
113; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
114; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
115; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
116; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
117; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
118; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
119; VL_BW_DQ-NEXT:    retq
120  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
121  ret <4 x i1> %b
122}
123
; Shuffles the <8 x i1> results of two <8 x i64> equality compares
; (%a2 = %a == %a1, %b2 = %b == %b1) with mask <3, 6, 1, 0, 3, 7, 7, 0>.
; Every mask index is below 8, so only lanes of %a2 are selected and %b2 does
; not contribute to the result; accordingly each pinned sequence contains a
; single vpcmpeqq.
; The permutation itself is pinned as vpermq (AVX512F only) or vpermd (the two
; configurations with VL) using the index vector [3,6,1,0,3,7,7,0].
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
124define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
125; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
126; AVX512F:       # %bb.0:
127; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
128; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
129; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
130; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
131; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
132; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
133; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
134; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
135; AVX512F-NEXT:    vzeroupper
136; AVX512F-NEXT:    retq
137;
138; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
139; AVX512VL:       # %bb.0:
140; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
141; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
142; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
143; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
144; AVX512VL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
145; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
146; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
147; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
148; AVX512VL-NEXT:    vzeroupper
149; AVX512VL-NEXT:    retq
150;
151; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
152; VL_BW_DQ:       # %bb.0:
153; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
154; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
155; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
156; VL_BW_DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
157; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
158; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
159; VL_BW_DQ-NEXT:    vzeroupper
160; VL_BW_DQ-NEXT:    retq
161  %a2 = icmp eq <8 x i64> %a, %a1
162  %b2 = icmp eq <8 x i64> %b, %b1
163  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
164  ret <8 x i1> %c
165}
166
; Genuine two-input <16 x i1> shuffle: %a2 = %a == %a1 and %b2 = %b == %b1 are
; combined with mask <3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0>. Indices 22 and 21
; are >= 16 and therefore select lanes 6 and 5 of %b2; all other indices select
; from %a2.
; Each pinned sequence performs both compares and a two-source vpermi2d with
; the mask as its index vector.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
167define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
168; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
169; AVX512F:       # %bb.0:
170; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
171; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
172; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
173; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
174; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
175; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
176; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
177; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
178; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
179; AVX512F-NEXT:    vzeroupper
180; AVX512F-NEXT:    retq
181;
182; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
183; AVX512VL:       # %bb.0:
184; AVX512VL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
185; AVX512VL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
186; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
187; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
188; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
189; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
190; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
191; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
192; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
193; AVX512VL-NEXT:    vzeroupper
194; AVX512VL-NEXT:    retq
195;
196; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
197; VL_BW_DQ:       # %bb.0:
198; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
199; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
200; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
201; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
202; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
203; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
204; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
205; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
206; VL_BW_DQ-NEXT:    vzeroupper
207; VL_BW_DQ-NEXT:    retq
208  %a2 = icmp eq <16 x i32> %a, %a1
209  %b2 = icmp eq <16 x i32> %b, %b1
210  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
211  ret <16 x i1> %c
212}
213
; Single-input <32 x i1> shuffle of the argument %a (second operand undef). The
; 32-entry mask is the 16-entry pattern 3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0
; written twice, so both halves of the result are identical.
; Without BW the pinned code splits the input into two 16-lane halves, runs one
; vpermi2d, and places the 128-bit result in both halves of ymm0 with
; vinserti128; with BW it is a single vpermw over 32 word elements.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
214define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
215; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
216; AVX512F:       # %bb.0:
217; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
218; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
219; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
220; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
221; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
222; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
223; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
224; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
225; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
226; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
227; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
228; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
229; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
230; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
231; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
232; AVX512F-NEXT:    retq
233;
234; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
235; AVX512VL:       # %bb.0:
236; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
237; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
238; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
239; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
240; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
241; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
242; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
243; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
244; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
245; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
246; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
247; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
248; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
249; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
250; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
251; AVX512VL-NEXT:    retq
252;
253; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
254; VL_BW_DQ:       # %bb.0:
255; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
256; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
257; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
258; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
259; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
260; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
261; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
262; VL_BW_DQ-NEXT:    retq
263  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
264  ret <32 x i1> %b
265}
266
; Same 32-lane mask as the plain shuf32i1 test, but the <32 x i1> input comes
; from "icmp eq <32 x i16> %a, zeroinitializer" and the shuffled mask drives a
; select between %c and %d, returning <32 x i16>.
; Without BW the pinned code compares each 256-bit half with vpcmpeqw, does the
; permutation as vpermi2d on dword elements, and blends the two result halves
; with vpblendvb; with BW it is vptestnmw, vpermw and a masked vpblendmw.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
267define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
268; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
269; AVX512F:       # %bb.0:
270; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
271; AVX512F-NEXT:    vpcmpeqw %ymm6, %ymm0, %ymm0
272; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
273; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
274; AVX512F-NEXT:    vpcmpeqw %ymm6, %ymm1, %ymm0
275; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
276; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
277; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
278; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
279; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
280; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm6
281; AVX512F-NEXT:    vptestmd %zmm6, %zmm6, %k1
282; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
283; AVX512F-NEXT:    vpmovdw %zmm0, %ymm1
284; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
285; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
286; AVX512F-NEXT:    retq
287;
288; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
289; AVX512VL:       # %bb.0:
290; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
291; AVX512VL-NEXT:    vpcmpeqw %ymm6, %ymm0, %ymm0
292; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
293; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k1
294; AVX512VL-NEXT:    vpcmpeqw %ymm6, %ymm1, %ymm0
295; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
296; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
297; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
298; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
299; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
300; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm6
301; AVX512VL-NEXT:    vptestmd %zmm6, %zmm6, %k1
302; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
303; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm1
304; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
305; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
306; AVX512VL-NEXT:    retq
307;
308; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
309; VL_BW_DQ:       # %bb.0:
310; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
311; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
312; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
313; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
314; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
315; VL_BW_DQ-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
316; VL_BW_DQ-NEXT:    retq
317  %cmp = icmp eq <32 x i16> %a, zeroinitializer
318  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
319  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
320  ret <32 x i16> %sel
321}
322
; Byte-element variant of the icmp_v32i16 test: the <32 x i1> input comes from
; "icmp eq <32 x i8> %a, zeroinitializer", is shuffled with the same 32-lane
; mask, and selects between %c and %d, returning <32 x i8>.
; Without BW the pinned code uses one vpcmpeqb, a dword vpermi2d, vinserti128
; to fill both halves of the blend mask, and a single vpblendvb; with BW it is
; vptestnmb, vpermw and a masked vpblendmb.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
323define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
324; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
325; AVX512F:       # %bb.0:
326; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
327; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
328; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
329; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
330; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
331; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
332; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
333; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
334; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
335; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
336; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
337; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
338; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
339; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
340; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
341; AVX512F-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
342; AVX512F-NEXT:    retq
343;
344; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
345; AVX512VL:       # %bb.0:
346; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
347; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
348; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm3
349; AVX512VL-NEXT:    vptestmd %zmm3, %zmm3, %k1
350; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
351; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
352; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
353; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
354; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
355; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
356; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
357; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
358; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
359; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
360; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
361; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
362; AVX512VL-NEXT:    retq
363;
364; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
365; VL_BW_DQ:       # %bb.0:
366; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
367; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
368; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
369; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
370; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
371; VL_BW_DQ-NEXT:    vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
372; VL_BW_DQ-NEXT:    retq
373  %cmp = icmp eq <32 x i8> %a, zeroinitializer
374  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
375  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
376  ret <32 x i8> %sel
377}
378
; "Split" variant of the icmp_v32i16 test: the <32 x i1> shuffle input is built
; by concatenating two <16 x i1> compares (%a == 0 and %b == 0 on <16 x i32>)
; with an identity shufflevector over indices 0..31. The concatenation is then
; shuffled with the same 32-lane mask and used to select between %c and %d.
; Without BW the pinned code feeds the two compare masks straight into a
; vpermi2d; with BW the concatenation appears as kunpckwd ahead of vpermw.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
379define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
380; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
381; AVX512F:       # %bb.0:
382; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
383; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
384; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
385; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
386; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
387; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm6
388; AVX512F-NEXT:    vptestmd %zmm6, %zmm6, %k1
389; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
390; AVX512F-NEXT:    vpmovdw %zmm0, %ymm1
391; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
392; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
393; AVX512F-NEXT:    retq
394;
395; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
396; AVX512VL:       # %bb.0:
397; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
398; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
399; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
400; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
401; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
402; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm6
403; AVX512VL-NEXT:    vptestmd %zmm6, %zmm6, %k1
404; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
405; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm1
406; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
407; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
408; AVX512VL-NEXT:    retq
409;
410; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
411; VL_BW_DQ:       # %bb.0:
412; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
413; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
414; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
415; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
416; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
417; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
418; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
419; VL_BW_DQ-NEXT:    vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
420; VL_BW_DQ-NEXT:    retq
421  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
422  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
423  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
424  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
425  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
426  ret <32 x i16> %sel
427}
428
; Byte-element "split" variant: two <16 x i32> compares against zero are
; concatenated into a <32 x i1> by an identity shufflevector (indices 0..31),
; shuffled with the 32-lane mask, and used to select between the <32 x i8>
; values %c and %d.
; Without BW the pinned code ends in vinserti128 plus a single vpblendvb; with
; BW the two masks are joined with kunpckwd and the select is a masked
; vpblendmb on ymm registers.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
429define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
430; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
431; AVX512F:       # %bb.0:
432; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
433; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
434; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
435; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
436; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
437; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
438; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
439; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
440; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
441; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
442; AVX512F-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
443; AVX512F-NEXT:    retq
444;
445; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
446; AVX512VL:       # %bb.0:
447; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
448; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
449; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
450; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
451; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
452; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
453; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
454; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
455; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
456; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
457; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
458; AVX512VL-NEXT:    retq
459;
460; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
461; VL_BW_DQ:       # %bb.0:
462; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
463; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
464; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
465; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
466; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
467; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
468; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
469; VL_BW_DQ-NEXT:    vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
470; VL_BW_DQ-NEXT:    retq
471  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
472  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
473  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
474  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
475  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
476  ret <32 x i8> %sel
477}
478
; Shuffle with mostly-undef mask: the i8 argument is bitcast to <8 x i1> and
; shuffled so that result lanes 1, 4 and 6 take source lane 2 while every other
; lane is undef. The <8 x i1> result is returned directly.
; The pinned sequences implement this as a broadcast (vpbroadcastq) of the
; part of the expanded vector that holds source lane 2.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
479define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
480; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
481; AVX512F:       # %bb.0:
482; AVX512F-NEXT:    kmovw %edi, %k1
483; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
484; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
485; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
486; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
487; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
488; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
489; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
490; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
491; AVX512F-NEXT:    vzeroupper
492; AVX512F-NEXT:    retq
493;
494; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
495; AVX512VL:       # %bb.0:
496; AVX512VL-NEXT:    kmovw %edi, %k1
497; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
498; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
499; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
500; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
501; AVX512VL-NEXT:    vpslld $31, %ymm1, %ymm1
502; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
503; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
504; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
505; AVX512VL-NEXT:    vzeroupper
506; AVX512VL-NEXT:    retq
507;
508; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
509; VL_BW_DQ:       # %bb.0:
510; VL_BW_DQ-NEXT:    kmovd %edi, %k0
511; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
512; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
513; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %ymm0
514; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
515; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
516; VL_BW_DQ-NEXT:    vzeroupper
517; VL_BW_DQ-NEXT:    retq
518  %b = bitcast i8 %a to <8 x i1>
519  %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
520  ret <8 x i1> %c
521}
522
; Shuffle against an all-zero second operand with some undef lanes: the i8
; argument is bitcast to <8 x i1>, shuffled with zeroinitializer using mask
; <10, 2, 9, undef, 3, undef, 2, undef>, and bitcast back to i8. Indices 10 and
; 9 are >= 8, so result lanes 0 and 2 come from the zero vector.
; The pinned sequences zero a register with vpxor and use a two-source
; vpermi2q / vpermi2d whose index vectors differ from the IR mask (for example
; <8,2,10,u,3,u,2,u> with AVX512F only); presumably equivalent because every
; lane of the second operand is zero.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
523define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
524; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
525; AVX512F:       # %bb.0:
526; AVX512F-NEXT:    kmovw %edi, %k1
527; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
528; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
529; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
530; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
531; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
532; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
533; AVX512F-NEXT:    kmovw %k0, %eax
534; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
535; AVX512F-NEXT:    vzeroupper
536; AVX512F-NEXT:    retq
537;
538; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
539; AVX512VL:       # %bb.0:
540; AVX512VL-NEXT:    kmovw %edi, %k1
541; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
542; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
543; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
544; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
545; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3]
546; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
547; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
548; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
549; AVX512VL-NEXT:    kmovw %k0, %eax
550; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
551; AVX512VL-NEXT:    vzeroupper
552; AVX512VL-NEXT:    retq
553;
554; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
555; VL_BW_DQ:       # %bb.0:
556; VL_BW_DQ-NEXT:    kmovd %edi, %k0
557; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
558; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
559; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
560; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3]
561; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
562; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
563; VL_BW_DQ-NEXT:    kmovd %k0, %eax
564; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
565; VL_BW_DQ-NEXT:    vzeroupper
566; VL_BW_DQ-NEXT:    retq
567  %b = bitcast i8 %a to <8 x i1>
568  %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
569  %d = bitcast <8 x i1> %c to i8
570  ret i8 %d
571}
572
; Single-input shuffle with an undef upper half: the i8 argument is bitcast to
; <8 x i1>, shuffled with mask <0, 1, 4, 5, undef, undef, undef, undef> (second
; operand undef), and bitcast back to i8.
; The pinned sequences perform this with one lane-group permute: vshufi64x2
; with AVX512F only, vpermq [0,2,2,3] in the two configurations with VL.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
573define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
574; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
575; AVX512F:       # %bb.0:
576; AVX512F-NEXT:    kmovw %edi, %k1
577; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
578; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
579; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
580; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
581; AVX512F-NEXT:    kmovw %k0, %eax
582; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
583; AVX512F-NEXT:    vzeroupper
584; AVX512F-NEXT:    retq
585;
586; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
587; AVX512VL:       # %bb.0:
588; AVX512VL-NEXT:    kmovw %edi, %k1
589; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
590; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
591; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
592; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
593; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
594; AVX512VL-NEXT:    kmovw %k0, %eax
595; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
596; AVX512VL-NEXT:    vzeroupper
597; AVX512VL-NEXT:    retq
598;
599; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
600; VL_BW_DQ:       # %bb.0:
601; VL_BW_DQ-NEXT:    kmovd %edi, %k0
602; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
603; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
604; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
605; VL_BW_DQ-NEXT:    kmovd %k0, %eax
606; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
607; VL_BW_DQ-NEXT:    vzeroupper
608; VL_BW_DQ-NEXT:    retq
609  %b = bitcast i8 %a to <8 x i1>
610  %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
611  %d = bitcast <8 x i1> %c to i8
612  ret i8 %d
613}
614
; Fully-defined shuffle against an all-zero second operand: the i8 argument is
; bitcast to <8 x i1>, shuffled with zeroinitializer using mask
; <9, 6, 1, 0, 3, 7, 7, 0>, and bitcast back to i8. Index 9 is >= 8, so result
; lane 0 comes from the zero vector; the remaining lanes come from the input.
; The pinned index vector is [8,6,1,0,3,7,7,0], i.e. 8 where the IR mask has 9;
; presumably equivalent because every lane of the second operand is zero.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
615define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
616; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
617; AVX512F:       # %bb.0:
618; AVX512F-NEXT:    kmovw %edi, %k1
619; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
620; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
621; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
622; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
623; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
624; AVX512F-NEXT:    kmovw %k0, %eax
625; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
626; AVX512F-NEXT:    vzeroupper
627; AVX512F-NEXT:    retq
628;
629; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
630; AVX512VL:       # %bb.0:
631; AVX512VL-NEXT:    kmovw %edi, %k1
632; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
633; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
634; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
635; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
636; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
637; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
638; AVX512VL-NEXT:    kmovw %k0, %eax
639; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
640; AVX512VL-NEXT:    vzeroupper
641; AVX512VL-NEXT:    retq
642;
643; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
644; VL_BW_DQ:       # %bb.0:
645; VL_BW_DQ-NEXT:    kmovd %edi, %k0
646; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
647; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
648; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
649; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
650; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
651; VL_BW_DQ-NEXT:    kmovd %k0, %eax
652; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
653; VL_BW_DQ-NEXT:    vzeroupper
654; VL_BW_DQ-NEXT:    retq
655  %b = bitcast i8 %a to <8 x i1>
656  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
657  %d = bitcast <8 x i1>%c to i8
658  ret i8 %d
659}
660
; Two-operand 8 x i1 shuffle with the operands swapped relative to the test
; above: the FIRST operand is all zeros and the SECOND operand is the mask
; bitcast from the i8 argument. Only mask indices 9 and 10 (result lanes 0 and
; 3) reach the argument, selecting its elements 1 and 2; all other result lanes
; read the zero operand and are therefore 0.
; Expected lowering: vpermt2q against a zeroed register for the +avx512f-only
; run; with +avx512vl a vpshufd that moves the two live elements into place
; followed by a vpblendd with zero.
661define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
662; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
663; AVX512F:       # %bb.0:
664; AVX512F-NEXT:    kmovw %edi, %k1
665; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
666; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
667; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
668; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
669; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
670; AVX512F-NEXT:    kmovw %k0, %eax
671; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
672; AVX512F-NEXT:    vzeroupper
673; AVX512F-NEXT:    retq
674;
675; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
676; AVX512VL:       # %bb.0:
677; AVX512VL-NEXT:    kmovw %edi, %k1
678; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
679; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
680; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
681; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
682; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
683; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
684; AVX512VL-NEXT:    kmovw %k0, %eax
685; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
686; AVX512VL-NEXT:    vzeroupper
687; AVX512VL-NEXT:    retq
688;
689; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
690; VL_BW_DQ:       # %bb.0:
691; VL_BW_DQ-NEXT:    kmovd %edi, %k0
692; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
693; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
694; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
695; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
696; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
697; VL_BW_DQ-NEXT:    kmovd %k0, %eax
698; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
699; VL_BW_DQ-NEXT:    vzeroupper
700; VL_BW_DQ-NEXT:    retq
701  %b = bitcast i8 %a to <8 x i1>
702  %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
703  %d = bitcast <8 x i1>%c to i8
704  ret i8 %d
705}
706
; Two-operand 8 x i1 shuffle whose FIRST operand is the non-uniform constant
; <1,1,0,0,1,1,0,0> and whose SECOND operand is the mask bitcast from the i8
; argument. Only mask index 9 (result lane 0) reaches the argument, selecting
; its element 1; result lanes 1-7 are read from the constant.
; NOTE(review): the function name spells the mask as 9_6_1_10_3_7_7_1, but the
; shufflevector below uses index 0 (not 10) for result lane 3. The name is kept
; as is because the autogenerated LABEL assertions depend on it; changing the
; mask instead would require regenerating every assertion in this function.
707define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
708; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
709; AVX512F:       # %bb.0:
710; AVX512F-NEXT:    kmovw %edi, %k1
711; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
712; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
713; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
714; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
715; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
716; AVX512F-NEXT:    kmovw %k0, %eax
717; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
718; AVX512F-NEXT:    vzeroupper
719; AVX512F-NEXT:    retq
720;
721; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
722; AVX512VL:       # %bb.0:
723; AVX512VL-NEXT:    kmovw %edi, %k1
724; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
725; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
726; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
727; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
728; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
729; AVX512VL-NEXT:    kmovw %k0, %eax
730; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
731; AVX512VL-NEXT:    vzeroupper
732; AVX512VL-NEXT:    retq
733;
734; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
735; VL_BW_DQ:       # %bb.0:
736; VL_BW_DQ-NEXT:    kmovd %edi, %k0
737; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
738; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
739; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
740; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
741; VL_BW_DQ-NEXT:    kmovd %k0, %eax
742; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
743; VL_BW_DQ-NEXT:    vzeroupper
744; VL_BW_DQ-NEXT:    retq
745  %b = bitcast i8 %a to <8 x i1>
746  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
747  %c1 = bitcast <8 x i1>%c to i8
748  ret i8 %c1
749}
750
; Two-operand 8 x i1 shuffle whose FIRST operand is the all-ones constant and
; whose SECOND operand is the <8 x i1> argument (passed as a vector here, not
; bitcast from an integer). Only mask index 9 (result lane 0) reaches the
; argument, selecting its element 1; result lanes 1-7 read the all-ones
; constant.
; Expected lowering: vpermt2q into an all-ones register for the +avx512f-only
; run; with +avx512vl a vpshufd that moves element 1 into lane 0 followed by a
; vpblendd with an all-ones register.
; NOTE(review): the function name spells the mask as 9_6_1_10_3_7_7_0, but the
; shufflevector below uses index 0 (not 10) for result lane 3. The name is kept
; as is because the autogenerated LABEL assertions depend on it.
751define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
752; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
753; AVX512F:       # %bb.0:
754; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
755; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
756; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
757; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
758; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
759; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
760; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
761; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
762; AVX512F-NEXT:    kmovw %k0, %eax
763; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
764; AVX512F-NEXT:    vzeroupper
765; AVX512F-NEXT:    retq
766;
767; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
768; AVX512VL:       # %bb.0:
769; AVX512VL-NEXT:    vpmovsxwd %xmm0, %ymm0
770; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
771; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k1
772; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
773; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
774; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
775; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
776; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
777; AVX512VL-NEXT:    kmovw %k0, %eax
778; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
779; AVX512VL-NEXT:    vzeroupper
780; AVX512VL-NEXT:    retq
781;
782; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
783; VL_BW_DQ:       # %bb.0:
784; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
785; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
786; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
787; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
788; VL_BW_DQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
789; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
790; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
791; VL_BW_DQ-NEXT:    kmovd %k0, %eax
792; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
793; VL_BW_DQ-NEXT:    vzeroupper
794; VL_BW_DQ-NEXT:    retq
795  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
796  %c1 = bitcast <8 x i1>%c to i8
797  ret i8 %c1
798}
799
800
; Splat of element 0 across a 16 x i1 mask: the i16 argument is bitcast to
; <16 x i1>, shuffled with an all-zero index mask (every result lane reads
; source lane 0; the undef second operand is never referenced), and bitcast
; back to i16.
; Expected lowering in all three runs: expand the mask to 32-bit lanes in a
; zmm register, vpbroadcastd the low element, and convert back to a k-register.
801define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
802; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
803; AVX512F:       # %bb.0:
804; AVX512F-NEXT:    kmovw %edi, %k1
805; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
806; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
807; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
808; AVX512F-NEXT:    kmovw %k0, %eax
809; AVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
810; AVX512F-NEXT:    vzeroupper
811; AVX512F-NEXT:    retq
812;
813; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
814; AVX512VL:       # %bb.0:
815; AVX512VL-NEXT:    kmovw %edi, %k1
816; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
817; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
818; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
819; AVX512VL-NEXT:    kmovw %k0, %eax
820; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
821; AVX512VL-NEXT:    vzeroupper
822; AVX512VL-NEXT:    retq
823;
824; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
825; VL_BW_DQ:       # %bb.0:
826; VL_BW_DQ-NEXT:    kmovd %edi, %k0
827; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
828; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
829; VL_BW_DQ-NEXT:    vpmovd2m %zmm0, %k0
830; VL_BW_DQ-NEXT:    kmovd %k0, %eax
831; VL_BW_DQ-NEXT:    # kill: def $ax killed $ax killed $eax
832; VL_BW_DQ-NEXT:    vzeroupper
833; VL_BW_DQ-NEXT:    retq
834  %b = bitcast i16 %a to <16 x i1>
835  %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
836  %d = bitcast <16 x i1> %c to i16
837  ret i16 %d
838}
839
; Splat of element 0 across a 64 x i1 mask: the i64 argument is bitcast to
; <64 x i1>, shuffled with an all-zero index mask (every result lane reads
; source lane 0), and bitcast back to i64.
; Expected lowering: the two runs without +avx512bw perform a 16-lane dword
; broadcast, read the resulting 16-bit mask out of the k-register, and then
; replicate it to 64 bits with scalar shift/or instructions. The run with
; +avx512bw keeps the whole mask in one k-register and uses
; vpmovm2b / vpbroadcastb / vpmovb2m on a zmm register.
840define i64 @shuf64i1_zero(i64 %a) {
841; AVX512F-LABEL: shuf64i1_zero:
842; AVX512F:       # %bb.0:
843; AVX512F-NEXT:    kmovw %edi, %k1
844; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
845; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
846; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
847; AVX512F-NEXT:    kmovw %k0, %eax
848; AVX512F-NEXT:    kmovw %k0, %ecx
849; AVX512F-NEXT:    shll $16, %ecx
850; AVX512F-NEXT:    orl %eax, %ecx
851; AVX512F-NEXT:    movq %rcx, %rax
852; AVX512F-NEXT:    shlq $32, %rax
853; AVX512F-NEXT:    orq %rcx, %rax
854; AVX512F-NEXT:    vzeroupper
855; AVX512F-NEXT:    retq
856;
857; AVX512VL-LABEL: shuf64i1_zero:
858; AVX512VL:       # %bb.0:
859; AVX512VL-NEXT:    kmovw %edi, %k1
860; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
861; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
862; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
863; AVX512VL-NEXT:    kmovw %k0, %eax
864; AVX512VL-NEXT:    kmovw %k0, %ecx
865; AVX512VL-NEXT:    shll $16, %ecx
866; AVX512VL-NEXT:    orl %eax, %ecx
867; AVX512VL-NEXT:    movq %rcx, %rax
868; AVX512VL-NEXT:    shlq $32, %rax
869; AVX512VL-NEXT:    orq %rcx, %rax
870; AVX512VL-NEXT:    vzeroupper
871; AVX512VL-NEXT:    retq
872;
873; VL_BW_DQ-LABEL: shuf64i1_zero:
874; VL_BW_DQ:       # %bb.0:
875; VL_BW_DQ-NEXT:    kmovq %rdi, %k0
876; VL_BW_DQ-NEXT:    vpmovm2b %k0, %zmm0
877; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
878; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
879; VL_BW_DQ-NEXT:    kmovq %k0, %rax
880; VL_BW_DQ-NEXT:    vzeroupper
881; VL_BW_DQ-NEXT:    retq
882  %b = bitcast i64 %a to <64 x i1>
883  %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
884  %d = bitcast <64 x i1> %c to i64
885  ret i64 %d
886}
887