• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ
5
; Test: swap the two lanes of a <2 x i1> vector (shuffle mask <1, 0>).
; In the expected code for every check prefix the i1 lanes are expanded to
; 64-bit elements through a mask register, swapped with vpshufd [2,3,0,1],
; and then passed through a mask register once more before returning.
6define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
7; AVX512F-LABEL: shuf2i1_1_0:
8; AVX512F:       # %bb.0:
9; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
10; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
11; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
12; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
13; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
14; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
15; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
16; AVX512F-NEXT:    vzeroupper
17; AVX512F-NEXT:    retq
18;
19; AVX512VL-LABEL: shuf2i1_1_0:
20; AVX512VL:       # %bb.0:
21; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
22; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
23; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
24; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
25; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
26; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
27; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
28; AVX512VL-NEXT:    retq
29;
30; VL_BW_DQ-LABEL: shuf2i1_1_0:
31; VL_BW_DQ:       # %bb.0:
32; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
33; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
34; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
35; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
36; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
37; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
38; VL_BW_DQ-NEXT:    retq
; Single-source shuffle: the second operand is undef and the mask only uses
; indices 0 and 1, so both result lanes come from %a.
39  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
40  ret <2 x i1> %b
41}
42
; Test: two-source <2 x i1> shuffle with mask <1, 2>. Result lane 0 is lane 1
; of %a; result lane 1 is lane 0 of the constant vector <i1 1, i1 0>.
; The expected code for every check prefix realizes this as a vpalignr of the
; expanded %a with the constant [18446744073709551615,0] (all bits set in the
; low 64-bit element, zero in the high one).
43define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
44; AVX512F-LABEL: shuf2i1_1_2:
45; AVX512F:       # %bb.0:
46; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
47; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
48; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
49; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
50; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
51; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
52; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
53; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
54; AVX512F-NEXT:    vzeroupper
55; AVX512F-NEXT:    retq
56;
57; AVX512VL-LABEL: shuf2i1_1_2:
58; AVX512VL:       # %bb.0:
59; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
60; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
61; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
62; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
63; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0]
64; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
65; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
66; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
67; AVX512VL-NEXT:    retq
68;
69; VL_BW_DQ-LABEL: shuf2i1_1_2:
70; VL_BW_DQ:       # %bb.0:
71; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
72; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
73; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
74; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
75; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
76; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
77; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
78; VL_BW_DQ-NEXT:    retq
; Mask index 2 addresses lane 0 of the second (constant) operand.
79  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
80  ret <2 x i1> %b
81}
82
83
; Test: reverse the four lanes of a <4 x i1> vector (shuffle mask <3, 2, 1, 0>).
; The expected code for every check prefix expands the lanes to 32-bit
; elements and reverses them with vpshufd [3,2,1,0].
; NOTE(review): the "3_2_10" suffix of the function name presumably stands for
; the mask indices 3, 2, 1, 0 - confirm before renaming, because the LABEL
; check lines embed the function name.
84define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
85; AVX512F-LABEL: shuf4i1_3_2_10:
86; AVX512F:       # %bb.0:
87; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
88; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
89; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
90; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
91; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
92; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
93; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
94; AVX512F-NEXT:    vzeroupper
95; AVX512F-NEXT:    retq
96;
97; AVX512VL-LABEL: shuf4i1_3_2_10:
98; AVX512VL:       # %bb.0:
99; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
100; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
101; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
102; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} {z}
103; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
104; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
105; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
106; AVX512VL-NEXT:    retq
107;
108; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
109; VL_BW_DQ:       # %bb.0:
110; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
111; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
112; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
113; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
114; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
115; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
116; VL_BW_DQ-NEXT:    retq
; Single-source shuffle: second operand is undef, mask is a full reversal.
117  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
118  ret <4 x i1> %b
119}
120
; Test: <8 x i1> shuffle of compare results with mask <3,6,1,0,3,7,7,0>.
; Every mask index is below 8, so only lanes of %a2 (%a == %a1) are selected.
; %b2 (%b == %b1) is passed as the second shuffle operand but none of its lanes
; are referenced, and the expected code for every check prefix accordingly
; contains a single vpcmpeqq (on zmm0 and zmm2).
121define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
122; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
123; AVX512F:       # %bb.0:
124; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
125; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
126; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
127; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
128; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
129; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
130; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
131; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
132; AVX512F-NEXT:    vzeroupper
133; AVX512F-NEXT:    retq
134;
135; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
136; AVX512VL:       # %bb.0:
137; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
138; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
139; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
140; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
141; AVX512VL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
142; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
143; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
144; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
145; AVX512VL-NEXT:    vzeroupper
146; AVX512VL-NEXT:    retq
147;
148; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
149; VL_BW_DQ:       # %bb.0:
150; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
151; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
152; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
153; VL_BW_DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
154; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
155; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
156; VL_BW_DQ-NEXT:    vzeroupper
157; VL_BW_DQ-NEXT:    retq
; Both compares produce <8 x i1>; the shuffle mask below only reads %a2.
158  %a2 = icmp eq <8 x i64> %a, %a1
159  %b2 = icmp eq <8 x i64> %b, %b1
160  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
161  ret <8 x i1> %c
162}
163
; Test: two-source <16 x i1> shuffle of compare results.
; Mask indices 22 and 21 (>= 16) select lanes 6 and 5 of %b2 (%b == %b1); all
; other indices select lanes of %a2 (%a == %a1).
; The expected code for every check prefix expands both compare masks to dword
; vectors and combines them with a single two-table vpermi2d.
164define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
165; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
166; AVX512F:       # %bb.0:
167; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
168; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
169; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
170; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
171; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
172; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
173; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
174; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
175; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
176; AVX512F-NEXT:    vzeroupper
177; AVX512F-NEXT:    retq
178;
179; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
180; AVX512VL:       # %bb.0:
181; AVX512VL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
182; AVX512VL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
183; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
184; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
185; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
186; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
187; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
188; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
189; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
190; AVX512VL-NEXT:    vzeroupper
191; AVX512VL-NEXT:    retq
192;
193; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
194; VL_BW_DQ:       # %bb.0:
195; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
196; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
197; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
198; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
199; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
200; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
201; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
202; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
203; VL_BW_DQ-NEXT:    vzeroupper
204; VL_BW_DQ-NEXT:    retq
; Indices 0-15 address %a2, indices 16-31 address %b2.
205  %a2 = icmp eq <16 x i32> %a, %a1
206  %b2 = icmp eq <16 x i32> %b, %b1
207  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
208  ret <16 x i1> %c
209}
210
; Test: single-source <32 x i1> shuffle whose 32-entry mask is the 16-entry
; pattern <3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0> repeated twice.
; With the VL_BW_DQ prefix the expected code is a single vpermw on word
; elements. For the other two prefixes the input is split into two 16-lane
; halves, permuted once with vpermi2d, and the 16-lane result is placed in both
; 128-bit halves of the return value via vinserti128.
211define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
212; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
213; AVX512F:       # %bb.0:
214; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
215; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
216; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
217; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
218; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
219; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
220; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
221; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
222; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
223; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
224; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
225; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
226; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
227; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
228; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
229; AVX512F-NEXT:    retq
230;
231; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
232; AVX512VL:       # %bb.0:
233; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
234; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
235; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
236; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
237; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
238; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
239; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
240; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
241; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
242; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
243; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
244; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
245; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
246; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
247; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
248; AVX512VL-NEXT:    retq
249;
250; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
251; VL_BW_DQ:       # %bb.0:
252; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
253; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
254; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
255; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
256; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
257; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
258; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
259; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
260; VL_BW_DQ-NEXT:    retq
; All mask indices are below 32, so every result lane comes from %a.
261  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
262  ret <32 x i1> %b
263}
264
; Test: the repeated 32-lane mask <3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0> x2 is
; applied to the result of `icmp eq <32 x i16> %a, zeroinitializer`, and the
; shuffled <32 x i1> then selects per lane between %c (true) and %d (false).
; With the VL_BW_DQ prefix the expected code is vptestnmw + vpermw + a masked
; vpblendmw; for the other two prefixes the compare is done per 256-bit half,
; permuted with vpermi2d, and the sequence ends with vpternlogq $202.
265define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
266; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
267; AVX512F:       # %bb.0:
268; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
269; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
270; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
271; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
272; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
273; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
274; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
275; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
276; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
277; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
278; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
279; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
280; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
281; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
282; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
283; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
284; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
285; AVX512F-NEXT:    retq
286;
287; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
288; AVX512VL:       # %bb.0:
289; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
290; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
291; AVX512VL-NEXT:    vpmovsxwd %ymm4, %zmm4
292; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
293; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
294; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
295; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
296; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
297; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
298; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
299; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
300; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
301; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
302; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
303; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
304; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
305; AVX512VL-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
306; AVX512VL-NEXT:    retq
307;
308; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
309; VL_BW_DQ:       # %bb.0:
310; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
311; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
312; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
313; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
314; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
315; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
316; VL_BW_DQ-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
317; VL_BW_DQ-NEXT:    retq
; compare -> shuffle the i1 lanes -> use the shuffled mask as select condition.
318  %cmp = icmp eq <32 x i16> %a, zeroinitializer
319  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
320  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
321  ret <32 x i16> %sel
322}
323
; Test: byte-element variant of the compare + shuffle + select pattern. The
; repeated 32-lane mask is applied to `icmp eq <32 x i8> %a, zeroinitializer`
; and the shuffled <32 x i1> selects per lane between %c (true) and %d (false).
; With the VL_BW_DQ prefix the expected code is vptestnmb + vpermw + a masked
; vpblendmb; for the other two prefixes the final select is a vpblendvb.
324define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
325; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
326; AVX512F:       # %bb.0:
327; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
328; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
329; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
330; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
331; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
332; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
333; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
334; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
335; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
336; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
337; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
338; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
339; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
340; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
341; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
342; AVX512F-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
343; AVX512F-NEXT:    retq
344;
345; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
346; AVX512VL:       # %bb.0:
347; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
348; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
349; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm3
350; AVX512VL-NEXT:    vptestmd %zmm3, %zmm3, %k1
351; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
352; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
353; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
354; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
355; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
356; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
357; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
358; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
359; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
360; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
361; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
362; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
363; AVX512VL-NEXT:    retq
364;
365; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
366; VL_BW_DQ:       # %bb.0:
367; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
368; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
369; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
370; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
371; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
372; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
373; VL_BW_DQ-NEXT:    vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
374; VL_BW_DQ-NEXT:    retq
; compare -> shuffle the i1 lanes -> use the shuffled mask as select condition.
375  %cmp = icmp eq <32 x i8> %a, zeroinitializer
376  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
377  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
378  ret <32 x i8> %sel
379}
380
; Test: "split" variant - the <32 x i1> shuffle input is built by concatenating
; two <16 x i1> compare results (%a == 0 and %b == 0) instead of coming from a
; single 32-lane compare. The shuffled mask selects between %c and %d
; (<32 x i16>).
; With the VL_BW_DQ prefix the expected code joins the two mask registers with
; kunpckwd and permutes once with vpermw; for the other two prefixes the two
; 16-lane masks feed vpermi2d directly and the sequence ends with
; vpternlogq $202.
381define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
382; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
383; AVX512F:       # %bb.0:
384; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
385; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
386; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
387; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
388; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
389; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
390; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
391; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
392; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
393; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
394; AVX512F-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
395; AVX512F-NEXT:    retq
396;
397; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
398; AVX512VL:       # %bb.0:
399; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
400; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
401; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
402; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
403; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
404; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
405; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
406; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
407; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
408; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
409; AVX512VL-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
410; AVX512VL-NEXT:    retq
411;
412; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
413; VL_BW_DQ:       # %bb.0:
414; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
415; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
416; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
417; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
418; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
419; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
420; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
421; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
422; VL_BW_DQ-NEXT:    vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
423; VL_BW_DQ-NEXT:    retq
424  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
425  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
; Identity mask 0..31: %cmp1 becomes lanes 0-15 and %cmp2 lanes 16-31.
426  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; The shuffle under test, applied to the concatenated 32-lane mask.
427  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
428  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
429  ret <32 x i16> %sel
430}
431
; Test: "split" variant with byte select operands - the <32 x i1> shuffle input
; is the concatenation of two <16 x i1> compare results (%a == 0 and %b == 0),
; and the shuffled mask selects between %c and %d (<32 x i8>).
; With the VL_BW_DQ prefix the expected code joins the two mask registers with
; kunpckwd, permutes once with vpermw and blends with a masked vpblendmb; for
; the other two prefixes the two 16-lane masks feed vpermi2d and the final
; select is a vpblendvb.
432define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
433; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
434; AVX512F:       # %bb.0:
435; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
436; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
437; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
438; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
439; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
440; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
441; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
442; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
443; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
444; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
445; AVX512F-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
446; AVX512F-NEXT:    retq
447;
448; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
449; AVX512VL:       # %bb.0:
450; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
451; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
452; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
453; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
454; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
455; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
456; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
457; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
458; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
459; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
460; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
461; AVX512VL-NEXT:    retq
462;
463; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
464; VL_BW_DQ:       # %bb.0:
465; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
466; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
467; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
468; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
469; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
470; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
471; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
472; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
473; VL_BW_DQ-NEXT:    vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
474; VL_BW_DQ-NEXT:    retq
475  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
476  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
; Identity mask 0..31: %cmp1 becomes lanes 0-15 and %cmp2 lanes 16-31.
477  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; The shuffle under test, applied to the concatenated 32-lane mask.
478  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
479  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
480  ret <32 x i8> %sel
481}
482
; Test: <8 x i1> shuffle whose mask is partly undef. The i8 argument is bitcast
; to <8 x i1>; the only defined mask entries (result lanes 1, 4 and 6) all
; select input lane 2, and the remaining result lanes are undef.
; The expected code for every check prefix implements this as a broadcast
; (vpbroadcastq) of the element that holds lane 2.
483define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
484; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
485; AVX512F:       # %bb.0:
486; AVX512F-NEXT:    kmovw %edi, %k1
487; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
488; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
489; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
490; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
491; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
492; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
493; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
494; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
495; AVX512F-NEXT:    vzeroupper
496; AVX512F-NEXT:    retq
497;
498; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
499; AVX512VL:       # %bb.0:
500; AVX512VL-NEXT:    kmovw %edi, %k1
501; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
502; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
503; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
504; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
505; AVX512VL-NEXT:    vpslld $31, %ymm1, %ymm1
506; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
507; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
508; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
509; AVX512VL-NEXT:    vzeroupper
510; AVX512VL-NEXT:    retq
511;
512; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
513; VL_BW_DQ:       # %bb.0:
514; VL_BW_DQ-NEXT:    kmovd %edi, %k0
515; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
516; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
517; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %ymm0
518; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
519; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
520; VL_BW_DQ-NEXT:    vzeroupper
521; VL_BW_DQ-NEXT:    retq
; Reinterpret the 8 bits of %a as eight i1 lanes, then shuffle them.
522  %b = bitcast i8 %a to <8 x i1>
523  %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
524  ret <8 x i1> %c
525}
526
; Test: <8 x i1> shuffle mixing lanes of the input with lanes of an all-zero
; vector, with some undef mask entries; the result is returned as an i8.
; Mask <10, 2, 9, u, 3, u, 2, u>: indices 10 and 9 (>= 8) address the
; zeroinitializer operand, so result lanes 0 and 2 are false; lanes 1, 4 and 6
; come from input lanes 2, 3 and 2.
; The expected index vectors use 8 and 10 in positions 0 and 2; those likewise
; address the zeroed second table (the vpxor'd register) of vpermi2q/vpermi2d.
527define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
528; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
529; AVX512F:       # %bb.0:
530; AVX512F-NEXT:    kmovw %edi, %k1
531; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
532; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
533; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
534; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
535; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
536; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
537; AVX512F-NEXT:    kmovw %k0, %eax
538; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
539; AVX512F-NEXT:    vzeroupper
540; AVX512F-NEXT:    retq
541;
542; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
543; AVX512VL:       # %bb.0:
544; AVX512VL-NEXT:    kmovw %edi, %k1
545; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
546; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
547; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
548; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
549; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
550; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
551; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
552; AVX512VL-NEXT:    kmovw %k0, %eax
553; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
554; AVX512VL-NEXT:    vzeroupper
555; AVX512VL-NEXT:    retq
556;
557; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
558; VL_BW_DQ:       # %bb.0:
559; VL_BW_DQ-NEXT:    kmovd %edi, %k0
560; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
561; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
562; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
563; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
564; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
565; VL_BW_DQ-NEXT:    kmovd %k0, %eax
566; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
567; VL_BW_DQ-NEXT:    vzeroupper
568; VL_BW_DQ-NEXT:    retq
; i8 -> <8 x i1>, shuffle against an all-false vector, then <8 x i1> -> i8.
569  %b = bitcast i8 %a to <8 x i1>
570  %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
571  %d = bitcast <8 x i1> %c to i8
572  ret i8 %d
573}
574
; Test: single-source <8 x i1> shuffle with mask <0, 1, 4, 5, u, u, u, u>; the
; upper four result lanes are undef and the result is returned as an i8.
; The expected code needs no index vector: it uses an immediate-controlled
; shuffle (vshufi64x2 on 64-bit lanes for the AVX512F prefix, vpermq [0,2,2,3]
; on 32-bit lanes for the other two prefixes).
575define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
576; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
577; AVX512F:       # %bb.0:
578; AVX512F-NEXT:    kmovw %edi, %k1
579; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
580; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
581; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
582; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
583; AVX512F-NEXT:    kmovw %k0, %eax
584; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
585; AVX512F-NEXT:    vzeroupper
586; AVX512F-NEXT:    retq
587;
588; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
589; AVX512VL:       # %bb.0:
590; AVX512VL-NEXT:    kmovw %edi, %k1
591; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
592; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
593; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
594; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
595; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
596; AVX512VL-NEXT:    kmovw %k0, %eax
597; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
598; AVX512VL-NEXT:    vzeroupper
599; AVX512VL-NEXT:    retq
600;
601; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
602; VL_BW_DQ:       # %bb.0:
603; VL_BW_DQ-NEXT:    kmovd %edi, %k0
604; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
605; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
606; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
607; VL_BW_DQ-NEXT:    kmovd %k0, %eax
608; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
609; VL_BW_DQ-NEXT:    vzeroupper
610; VL_BW_DQ-NEXT:    retq
; i8 -> <8 x i1>, pick lane pairs (0,1) and (4,5), then <8 x i1> -> i8.
611  %b = bitcast i8 %a to <8 x i1>
612  %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
613  %d = bitcast <8 x i1> %c to i8
614  ret i8 %d
615}
616
; Two-operand shuffle of an i8-encoded 8-lane i1 mask with an all-zero vector.
; Mask index 9 picks lane 1 of the zeroinitializer operand, so result lane 0 is
; always 0; the remaining result lanes pick lanes 6, 1, 0, 3, 7, 7, 0 of the
; mask built from %a.
; The expected permute index constant is [8,6,1,0,3,7,7,0]: it uses 8 where the
; IR mask says 9. Both indices refer to a lane of the register zeroed by vpxor,
; so the result is the same.
617define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
618; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
619; AVX512F:       # %bb.0:
620; AVX512F-NEXT:    kmovw %edi, %k1
621; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
622; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
623; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
624; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
625; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
626; AVX512F-NEXT:    kmovw %k0, %eax
627; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
628; AVX512F-NEXT:    vzeroupper
629; AVX512F-NEXT:    retq
630;
631; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
632; AVX512VL:       # %bb.0:
633; AVX512VL-NEXT:    kmovw %edi, %k1
634; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
635; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
636; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
637; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
638; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
639; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
640; AVX512VL-NEXT:    kmovw %k0, %eax
641; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
642; AVX512VL-NEXT:    vzeroupper
643; AVX512VL-NEXT:    retq
644;
645; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
646; VL_BW_DQ:       # %bb.0:
647; VL_BW_DQ-NEXT:    kmovd %edi, %k0
648; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
649; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
650; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
651; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
652; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
653; VL_BW_DQ-NEXT:    kmovd %k0, %eax
654; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
655; VL_BW_DQ-NEXT:    vzeroupper
656; VL_BW_DQ-NEXT:    retq
657  %b = bitcast i8 %a to <8 x i1>
658  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
659  %d = bitcast <8 x i1>%c to i8
660  ret i8 %d
661}
662
; Two-operand shuffle where the first operand is zeroinitializer and the second
; is the 8-lane i1 mask built from %a.
; Only mask indices 9 and 10 (result lanes 0 and 3) reach into the second
; operand, selecting its lanes 1 and 2; every other result lane selects from
; the zero vector.
; The AVX512VL and VL_BW_DQ expectations reflect this with a single vpshufb
; that keeps two dwords of the expanded mask and zeroes everything else.
663define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
664; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
665; AVX512F:       # %bb.0:
666; AVX512F-NEXT:    kmovw %edi, %k1
667; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
668; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
669; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
670; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
671; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
672; AVX512F-NEXT:    kmovw %k0, %eax
673; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
674; AVX512F-NEXT:    vzeroupper
675; AVX512F-NEXT:    retq
676;
677; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
678; AVX512VL:       # %bb.0:
679; AVX512VL-NEXT:    kmovw %edi, %k1
680; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
681; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
682; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
683; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
684; AVX512VL-NEXT:    kmovw %k0, %eax
685; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
686; AVX512VL-NEXT:    vzeroupper
687; AVX512VL-NEXT:    retq
688;
689; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
690; VL_BW_DQ:       # %bb.0:
691; VL_BW_DQ-NEXT:    kmovd %edi, %k0
692; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
693; VL_BW_DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
694; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
695; VL_BW_DQ-NEXT:    kmovd %k0, %eax
696; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
697; VL_BW_DQ-NEXT:    vzeroupper
698; VL_BW_DQ-NEXT:    retq
699  %b = bitcast i8 %a to <8 x i1>
700  %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
701  %d = bitcast <8 x i1>%c to i8
702  ret i8 %d
703}
704
; Two-operand shuffle whose first operand is the constant mask
; <1,1,0,0,1,1,0,0> and whose second operand is the 8-lane i1 mask built
; from %a.
; Only result lane 0 (mask index 9) depends on %a, taking its lane 1; the other
; seven result lanes select from the constant operand.
; NOTE(review): the function name reads "10" in the fourth position and has a
; doubled underscore, but the shuffle mask below has index 0 in that position
; (<9,6,1,0,3,7,7,1>). Renaming would change the symbol and the LABEL checks,
; so the mismatch is only flagged here.
705define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
706; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
707; AVX512F:       # %bb.0:
708; AVX512F-NEXT:    kmovw %edi, %k1
709; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
710; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
711; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
712; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
713; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
714; AVX512F-NEXT:    kmovw %k0, %eax
715; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
716; AVX512F-NEXT:    vzeroupper
717; AVX512F-NEXT:    retq
718;
719; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
720; AVX512VL:       # %bb.0:
721; AVX512VL-NEXT:    kmovw %edi, %k1
722; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
723; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
724; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
725; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
726; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
727; AVX512VL-NEXT:    kmovw %k0, %eax
728; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
729; AVX512VL-NEXT:    vzeroupper
730; AVX512VL-NEXT:    retq
731;
732; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
733; VL_BW_DQ:       # %bb.0:
734; VL_BW_DQ-NEXT:    kmovd %edi, %k0
735; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
736; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
737; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
738; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
739; VL_BW_DQ-NEXT:    kmovd %k0, %eax
740; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
741; VL_BW_DQ-NEXT:    vzeroupper
742; VL_BW_DQ-NEXT:    retq
743  %b = bitcast i8 %a to <8 x i1>
744  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
745  %c1 = bitcast <8 x i1>%c to i8
746  ret i8 %c1
747}
748
; Takes the mask directly as a <8 x i1> argument (not as an i8) and shuffles it
; with an all-ones constant as the first operand; the result is bitcast to i8.
; Only result lane 0 (mask index 9) depends on %a, taking its lane 1; result
; lanes 1-7 select from the all-ones vector. All three expected permute index
; constants are [9,1,2,3,4,5,6,7].
; NOTE(review): the function name reads "10" in the fourth position, but the
; shuffle mask below has index 0 there (<9,6,1,0,3,7,7,0>). Renaming would
; change the symbol and the LABEL checks, so the mismatch is only flagged here.
749define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
750; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
751; AVX512F:       # %bb.0:
752; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
753; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
754; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
755; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
756; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
757; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
758; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
759; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
760; AVX512F-NEXT:    kmovw %k0, %eax
761; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
762; AVX512F-NEXT:    vzeroupper
763; AVX512F-NEXT:    retq
764;
765; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
766; AVX512VL:       # %bb.0:
767; AVX512VL-NEXT:    vpmovsxwd %xmm0, %ymm0
768; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
769; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k1
770; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
771; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
772; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7]
773; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
774; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
775; AVX512VL-NEXT:    kmovw %k0, %eax
776; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
777; AVX512VL-NEXT:    vzeroupper
778; AVX512VL-NEXT:    retq
779;
780; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
781; VL_BW_DQ:       # %bb.0:
782; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
783; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
784; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
785; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7]
786; VL_BW_DQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
787; VL_BW_DQ-NEXT:    vpermt2d %ymm0, %ymm1, %ymm2
788; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
789; VL_BW_DQ-NEXT:    kmovd %k0, %eax
790; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
791; VL_BW_DQ-NEXT:    vzeroupper
792; VL_BW_DQ-NEXT:    retq
793  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
794  %c1 = bitcast <8 x i1>%c to i8
795  ret i8 %c1
796}
797
; Splat of lane 0 of a 16-lane i1 mask that is passed and returned as an i16:
; the shuffle mask is zeroinitializer, so every result lane copies source
; lane 0 (the second shuffle operand is undef).
; Each expected sequence expands the mask into a zmm register, broadcasts the
; low dword with vpbroadcastd, converts back to a k-register and returns the
; result in %ax.
798define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
799; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
800; AVX512F:       # %bb.0:
801; AVX512F-NEXT:    kmovw %edi, %k1
802; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
803; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
804; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
805; AVX512F-NEXT:    kmovw %k0, %eax
806; AVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
807; AVX512F-NEXT:    vzeroupper
808; AVX512F-NEXT:    retq
809;
810; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
811; AVX512VL:       # %bb.0:
812; AVX512VL-NEXT:    kmovw %edi, %k1
813; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
814; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
815; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
816; AVX512VL-NEXT:    kmovw %k0, %eax
817; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
818; AVX512VL-NEXT:    vzeroupper
819; AVX512VL-NEXT:    retq
820;
821; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
822; VL_BW_DQ:       # %bb.0:
823; VL_BW_DQ-NEXT:    kmovd %edi, %k0
824; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
825; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
826; VL_BW_DQ-NEXT:    vpmovd2m %zmm0, %k0
827; VL_BW_DQ-NEXT:    kmovd %k0, %eax
828; VL_BW_DQ-NEXT:    # kill: def $ax killed $ax killed $eax
829; VL_BW_DQ-NEXT:    vzeroupper
830; VL_BW_DQ-NEXT:    retq
831  %b = bitcast i16 %a to <16 x i1>
832  %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
833  %d = bitcast <16 x i1> %c to i16
834  ret i16 %d
835}
836
; Splat of lane 0 of a 64-lane i1 mask that is passed and returned as an i64:
; the shuffle mask is zeroinitializer, so every result lane copies source
; lane 0 (the second shuffle operand is undef).
; The AVX512F and AVX512VL configurations (their RUN lines do not enable
; avx512bw) are expected to work on 16 dword lanes and then replicate the
; resulting 16-bit mask in general-purpose registers (shll $16 / orl, then
; shlq $32 / orq) to form the 64-bit result.
; The VL_BW_DQ configuration is expected to handle all 64 lanes as bytes in a
; single zmm register (vpmovm2b, vpbroadcastb, vpmovb2m) and move the mask in
; and out with kmovq.
837define i64 @shuf64i1_zero(i64 %a) {
838; AVX512F-LABEL: shuf64i1_zero:
839; AVX512F:       # %bb.0:
840; AVX512F-NEXT:    kmovw %edi, %k1
841; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
842; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
843; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
844; AVX512F-NEXT:    kmovw %k0, %eax
845; AVX512F-NEXT:    kmovw %k0, %ecx
846; AVX512F-NEXT:    shll $16, %ecx
847; AVX512F-NEXT:    orl %eax, %ecx
848; AVX512F-NEXT:    movq %rcx, %rax
849; AVX512F-NEXT:    shlq $32, %rax
850; AVX512F-NEXT:    orq %rcx, %rax
851; AVX512F-NEXT:    vzeroupper
852; AVX512F-NEXT:    retq
853;
854; AVX512VL-LABEL: shuf64i1_zero:
855; AVX512VL:       # %bb.0:
856; AVX512VL-NEXT:    kmovw %edi, %k1
857; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
858; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
859; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
860; AVX512VL-NEXT:    kmovw %k0, %eax
861; AVX512VL-NEXT:    kmovw %k0, %ecx
862; AVX512VL-NEXT:    shll $16, %ecx
863; AVX512VL-NEXT:    orl %eax, %ecx
864; AVX512VL-NEXT:    movq %rcx, %rax
865; AVX512VL-NEXT:    shlq $32, %rax
866; AVX512VL-NEXT:    orq %rcx, %rax
867; AVX512VL-NEXT:    vzeroupper
868; AVX512VL-NEXT:    retq
869;
870; VL_BW_DQ-LABEL: shuf64i1_zero:
871; VL_BW_DQ:       # %bb.0:
872; VL_BW_DQ-NEXT:    kmovq %rdi, %k0
873; VL_BW_DQ-NEXT:    vpmovm2b %k0, %zmm0
874; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
875; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
876; VL_BW_DQ-NEXT:    kmovq %k0, %rax
877; VL_BW_DQ-NEXT:    vzeroupper
878; VL_BW_DQ-NEXT:    retq
879  %b = bitcast i64 %a to <64 x i1>
880  %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
881  %d = bitcast <64 x i1> %c to i64
882  ret i64 %d
883}
884