• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ
4
5target triple = "x86_64-unknown-unknown"
6
; Swaps the two lanes of a <2 x i1> argument: the shuffle mask <1, 0> puts
; %a[1] in lane 0 and %a[0] in lane 1. The second shuffle operand is undef and
; no mask index selects from it.
7define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
8; AVX512F-LABEL: shuf2i1_1_0:
9; AVX512F:       # BB#0:
10; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
11; AVX512F-NEXT:    retq
12;
13; VL_BW_DQ-LABEL: shuf2i1_1_0:
14; VL_BW_DQ:       # BB#0:
15; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
16; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
17; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
18; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
19; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
20; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
21; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
22; VL_BW_DQ-NEXT:    retq
23  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
24  ret <2 x i1> %b
25}
26
; Two-input <2 x i1> shuffle against a constant vector. With mask <1, 2>,
; lane 0 of the result is %a[1] and lane 1 is element 0 of the constant
; <i1 1, i1 0> (mask index 2 addresses the first element of the second
; operand), i.e. true.
27define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
28; AVX512F-LABEL: shuf2i1_1_2:
29; AVX512F:       # BB#0:
30; AVX512F-NEXT:    movl $1, %eax
31; AVX512F-NEXT:    vmovq %rax, %xmm1
32; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
33; AVX512F-NEXT:    retq
34;
35; VL_BW_DQ-LABEL: shuf2i1_1_2:
36; VL_BW_DQ:       # BB#0:
37; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
38; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
39; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
40; VL_BW_DQ-NEXT:    movb $1, %al
41; VL_BW_DQ-NEXT:    kmovb %eax, %k0
42; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
43; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
44; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
45; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
46; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
47; VL_BW_DQ-NEXT:    retq
48  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
49  ret <2 x i1> %b
50}
51
52
; Reverses the four lanes of a <4 x i1> argument using the mask <3, 2, 1, 0>.
; The second shuffle operand is undef and no mask index selects from it.
; NOTE(review): the function name ends in "3_2_10" while the mask is
; 3, 2, 1, 0 -- presumably a missing underscore in the name; confirm before
; renaming, since the LABEL assertions below depend on it.
53define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
54; AVX512F-LABEL: shuf4i1_3_2_10:
55; AVX512F:       # BB#0:
56; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
57; AVX512F-NEXT:    retq
58;
59; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
60; VL_BW_DQ:       # BB#0:
61; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
62; VL_BW_DQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
63; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
64; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
65; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
66; VL_BW_DQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
67; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
68; VL_BW_DQ-NEXT:    retq
69  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
70  ret <4 x i1> %b
71}
72
; Shuffles <8 x i1> masks produced by vector compares. %a2 = (%a == %a1) and
; %b2 = (%b == %b1) are the two shuffle operands, and the mask is
; <3, 6, 1, 0, 3, 7, 7, 0>. Every mask index is below 8, so all result lanes
; are taken from %a2; %b2 is an operand of the shuffle but none of its lanes
; is selected.
73define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
74; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
75; AVX512F:       # BB#0:
76; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
77; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
78; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
79; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
80; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
81; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
82; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
83; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
84; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
85; AVX512F-NEXT:    retq
86;
87; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
88; VL_BW_DQ:       # BB#0:
89; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
90; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
91; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
92; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
93; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
94; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
95; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
96; VL_BW_DQ-NEXT:    retq
97  %a2 = icmp eq <8 x i64> %a, %a1
98  %b2 = icmp eq <8 x i64> %b, %b1
99  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
100  ret <8 x i1> %c
101}
102
; Two-input shuffle of <16 x i1> compare results. %a2 = (%a == %a1) and
; %b2 = (%b == %b1). In the mask, indices 22 and 21 (result lanes 2 and 13)
; are >= 16 and therefore select lanes 6 and 5 of %b2; every other index is
; below 16 and selects from %a2.
103define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
104; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
105; AVX512F:       # BB#0:
106; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
107; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
108; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
109; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2} {z}
110; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1} {z}
111; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
112; AVX512F-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
113; AVX512F-NEXT:    vpslld $31, %zmm2, %zmm1
114; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
115; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
116; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
117; AVX512F-NEXT:    retq
118;
119; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
120; VL_BW_DQ:       # BB#0:
121; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
122; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
123; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
124; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
125; VL_BW_DQ-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
126; VL_BW_DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
127; VL_BW_DQ-NEXT:    vpslld $31, %zmm1, %zmm0
128; VL_BW_DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
129; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
130; VL_BW_DQ-NEXT:    retq
131  %a2 = icmp eq <16 x i32> %a, %a1
132  %b2 = icmp eq <16 x i32> %b, %b1
133  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
134  ret <16 x i1> %c
135}
136
; Single-input shuffle of a <32 x i1> argument. The 32-entry mask is the
; 16-entry pattern 3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0 written twice. The
; second operand is undef and every mask index is below 32, so all result
; lanes come from %a.
137define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
138; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
139; AVX512F:       # BB#0:
140; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
141; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
142; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
143; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
144; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
145; AVX512F-NEXT:    retq
146;
147; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
148; VL_BW_DQ:       # BB#0:
149; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
150; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
151; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
152; VL_BW_DQ-NEXT:    vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
153; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
154; VL_BW_DQ-NEXT:    vpsllw $15, %zmm0, %zmm0
155; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
156; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
157; VL_BW_DQ-NEXT:    retq
158  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
159  ret <32 x i1> %b
160}
161
; Shuffle whose mask contains only undef entries and the index 2. The i8
; argument is bitcast to <8 x i1>; result lanes 1, 4 and 6 are lane 2 of that
; vector and the remaining lanes are undef. The <8 x i1> result is returned
; directly (no bitcast back to i8).
162define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
163; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
164; AVX512F:       # BB#0:
165; AVX512F-NEXT:    kmovw %edi, %k1
166; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
167; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
168; AVX512F-NEXT:    vextracti32x4 $1, %zmm1, %xmm1
169; AVX512F-NEXT:    vpbroadcastq %xmm1, %zmm1
170; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
171; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
172; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
173; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
174; AVX512F-NEXT:    retq
175;
176; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
177; VL_BW_DQ:       # BB#0:
178; VL_BW_DQ-NEXT:    kmovb %edi, %k0
179; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
180; VL_BW_DQ-NEXT:    vextracti64x2 $1, %zmm0, %xmm0
181; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %zmm0
182; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
183; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
184; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
185; VL_BW_DQ-NEXT:    retq
186  %b = bitcast i8 %a to <8 x i1>
187  %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
188  ret <8 x i1> %c
189}
190
; Shuffle of a bitcast i8 mask against an all-zero vector, with undef lanes.
; Mask <10, 2, 9, u, 3, u, 2, u>: indices 10 and 9 (result lanes 0 and 2)
; address the zeroinitializer operand and are therefore false; lanes 1 and 6
; are %b[2], lane 4 is %b[3], and lanes 3, 5 and 7 are undef. The result is
; bitcast back to i8 and returned.
191define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
192; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
193; AVX512F:       # BB#0:
194; AVX512F-NEXT:    kmovw %edi, %k1
195; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
196; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
197; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
198; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
199; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
200; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
201; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
202; AVX512F-NEXT:    kmovw %k0, %eax
203; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
204; AVX512F-NEXT:    retq
205;
206; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
207; VL_BW_DQ:       # BB#0:
208; VL_BW_DQ-NEXT:    kmovb %edi, %k0
209; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
210; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
211; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
212; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
213; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
214; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
215; VL_BW_DQ-NEXT:    kmovb %k0, %eax
216; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
217; VL_BW_DQ-NEXT:    retq
218  %b = bitcast i8 %a to <8 x i1>
219  %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
220  %d = bitcast <8 x i1> %c to i8
221  ret i8 %d
222}
223
; Single-input shuffle of a bitcast i8 mask with an undef upper half.
; Mask <0, 1, 4, 5, u, u, u, u>: result lanes 0..3 are %b[0], %b[1], %b[4]
; and %b[5]; lanes 4..7 are undef. The second operand is undef and is never
; selected. The result is bitcast back to i8 and returned.
224define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
225; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
226; AVX512F:       # BB#0:
227; AVX512F-NEXT:    kmovw %edi, %k1
228; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
229; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
230; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
231; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
232; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
233; AVX512F-NEXT:    kmovw %k0, %eax
234; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
235; AVX512F-NEXT:    retq
236;
237; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
238; VL_BW_DQ:       # BB#0:
239; VL_BW_DQ-NEXT:    kmovb %edi, %k0
240; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
241; VL_BW_DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
242; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
243; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
244; VL_BW_DQ-NEXT:    kmovb %k0, %eax
245; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
246; VL_BW_DQ-NEXT:    retq
247  %b = bitcast i8 %a to <8 x i1>
248  %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
249  %d = bitcast <8 x i1> %c to i8
250  ret i8 %d
251}
252
; Shuffle of a bitcast i8 mask where one lane comes from an all-zero vector.
; Mask <9, 6, 1, 0, 3, 7, 7, 0>: index 9 (result lane 0) addresses element 1
; of the zeroinitializer operand, so lane 0 is false; lanes 1..7 are %b[6],
; %b[1], %b[0], %b[3], %b[7], %b[7] and %b[0]. The result is bitcast back to
; i8 and returned.
253define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
254; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
255; AVX512F:       # BB#0:
256; AVX512F-NEXT:    kmovw %edi, %k1
257; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
258; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
259; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
260; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
261; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
262; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
263; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
264; AVX512F-NEXT:    kmovw %k0, %eax
265; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
266; AVX512F-NEXT:    retq
267;
268; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
269; VL_BW_DQ:       # BB#0:
270; VL_BW_DQ-NEXT:    kmovb %edi, %k0
271; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
272; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
273; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
274; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
275; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
276; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
277; VL_BW_DQ-NEXT:    kmovb %k0, %eax
278; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
279; VL_BW_DQ-NEXT:    retq
280  %b = bitcast i8 %a to <8 x i1>
281  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
282  %d = bitcast <8 x i1>%c to i8
283  ret i8 %d
284}
285
; Shuffle with the all-zero vector as the FIRST operand and the bitcast i8
; mask %b as the second. Mask <9, 6, 1, 10, 3, 7, 7, 0>: indices 9 and 10
; (result lanes 0 and 3) select %b[1] and %b[2]; every other index is below 8
; and selects from zeroinitializer, so those lanes are false. The result is
; bitcast back to i8 and returned.
286define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
287; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
288; AVX512F:       # BB#0:
289; AVX512F-NEXT:    kmovw %edi, %k1
290; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
291; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
292; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
293; AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
294; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
295; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
296; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
297; AVX512F-NEXT:    kmovw %k0, %eax
298; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
299; AVX512F-NEXT:    retq
300;
301; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
302; VL_BW_DQ:       # BB#0:
303; VL_BW_DQ-NEXT:    kmovb %edi, %k0
304; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
305; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
306; VL_BW_DQ-NEXT:    vpxord %zmm2, %zmm2, %zmm2
307; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
308; VL_BW_DQ-NEXT:    vpsllq $63, %zmm2, %zmm0
309; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
310; VL_BW_DQ-NEXT:    kmovb %k0, %eax
311; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
312; VL_BW_DQ-NEXT:    retq
313  %b = bitcast i8 %a to <8 x i1>
314  %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
315  %d = bitcast <8 x i1>%c to i8
316  ret i8 %d
317}
318
; Shuffle with a non-trivial constant vector as the first operand and the
; bitcast i8 mask %b as the second. Mask <9, 6, 1, 0, 3, 7, 7, 1>: index 9
; (result lane 0) selects %b[1]; lanes 1..7 select elements 6, 1, 0, 3, 7, 7
; and 1 of the constant <1,1,0,0,1,1,0,0>, i.e. 0, 1, 1, 0, 0, 0, 1. The
; result is bitcast back to i8 and returned.
; NOTE(review): the function name has "10" in the fourth position but the
; mask in the IR has 0 there -- confirm which one is intended before changing
; either, since the LABEL assertions below depend on the name.
319define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
320; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
321; AVX512F:       # BB#0:
322; AVX512F-NEXT:    kmovw %edi, %k1
323; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
324; AVX512F-NEXT:    movb $51, %al
325; AVX512F-NEXT:    kmovw %eax, %k2
326; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2} {z}
327; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
328; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
329; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
330; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm0
331; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
332; AVX512F-NEXT:    kmovw %k0, %eax
333; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
334; AVX512F-NEXT:    retq
335;
336; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
337; VL_BW_DQ:       # BB#0:
338; VL_BW_DQ-NEXT:    kmovb %edi, %k0
339; VL_BW_DQ-NEXT:    movb $51, %al
340; VL_BW_DQ-NEXT:    kmovb %eax, %k1
341; VL_BW_DQ-NEXT:    vpmovm2q %k1, %zmm0
342; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm1
343; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
344; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
345; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
346; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
347; VL_BW_DQ-NEXT:    kmovb %k0, %eax
348; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
349; VL_BW_DQ-NEXT:    retq
350  %b = bitcast i8 %a to <8 x i1>
351  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
352  %c1 = bitcast <8 x i1>%c to i8
353  ret i8 %c1
354}
355
; Shuffle with an all-ones constant as the first operand and the <8 x i1>
; argument %a as the second. Mask <9, 6, 1, 0, 3, 7, 7, 0>: index 9 (result
; lane 0) selects %a[1]; every other index is below 8 and selects from the
; all-ones vector, so lanes 1..7 are true. The result is bitcast to i8 and
; returned.
; NOTE(review): the function name has "10" in the fourth position but the
; mask in the IR has 0 there -- confirm which one is intended before changing
; either, since the LABEL assertions below depend on the name.
356define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
357; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
358; AVX512F:       # BB#0:
359; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
360; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
361; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
362; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
363; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
364; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
365; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
366; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
367; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
368; AVX512F-NEXT:    kmovw %k0, %eax
369; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
370; AVX512F-NEXT:    retq
371;
372; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
373; VL_BW_DQ:       # BB#0:
374; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
375; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
376; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
377; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
378; VL_BW_DQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
379; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
380; VL_BW_DQ-NEXT:    vpsllq $63, %zmm2, %zmm0
381; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
382; VL_BW_DQ-NEXT:    kmovb %k0, %eax
383; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
384; VL_BW_DQ-NEXT:    retq
385  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
386  %c1 = bitcast <8 x i1>%c to i8
387  ret i8 %c1
388}
389
390
; Broadcast (splat) of lane 0 across a 16-lane i1 vector. The i16 argument is
; bitcast to <16 x i1> and shuffled with a zeroinitializer mask, so every
; result lane is %b[0]; the second operand is undef and never selected. The
; result is bitcast back to i16 and returned.
391define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
392; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
393; AVX512F:       # BB#0:
394; AVX512F-NEXT:    kmovw %edi, %k1
395; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
396; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
397; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
398; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
399; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
400; AVX512F-NEXT:    kmovw %k0, %eax
401; AVX512F-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
402; AVX512F-NEXT:    retq
403;
404; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
405; VL_BW_DQ:       # BB#0:
406; VL_BW_DQ-NEXT:    kmovw %edi, %k0
407; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
408; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
409; VL_BW_DQ-NEXT:    vpslld $31, %zmm0, %zmm0
410; VL_BW_DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
411; VL_BW_DQ-NEXT:    kmovw %k0, %eax
412; VL_BW_DQ-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
413; VL_BW_DQ-NEXT:    retq
414  %b = bitcast i16 %a to <16 x i1>
415  %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
416  %d = bitcast <16 x i1> %c to i16
417  ret i16 %d
418}
419
; Broadcast (splat) of lane 0 across a 64-lane i1 vector. The i64 argument is
; bitcast to <64 x i1> and shuffled with a zeroinitializer mask, so every
; result lane is %b[0]; the second operand is undef and never selected. The
; result is bitcast back to i64 and returned.
; The two sets of expected output below differ markedly: one goes through a
; realigned stack frame and two 16-lane halves, the other stays in a single
; 64-bit mask register.
420define i64 @shuf64i1_zero(i64 %a) {
421; AVX512F-LABEL: shuf64i1_zero:
422; AVX512F:       # BB#0:
423; AVX512F-NEXT:    pushq %rbp
424; AVX512F-NEXT:  .Ltmp0:
425; AVX512F-NEXT:    .cfi_def_cfa_offset 16
426; AVX512F-NEXT:  .Ltmp1:
427; AVX512F-NEXT:    .cfi_offset %rbp, -16
428; AVX512F-NEXT:    movq %rsp, %rbp
429; AVX512F-NEXT:  .Ltmp2:
430; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
431; AVX512F-NEXT:    andq $-32, %rsp
432; AVX512F-NEXT:    subq $96, %rsp
433; AVX512F-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
434; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
435; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
436; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
437; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
438; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
439; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
440; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
441; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
442; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
443; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
444; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
445; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
446; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
447; AVX512F-NEXT:    kmovw %k0, (%rsp)
448; AVX512F-NEXT:    movl (%rsp), %ecx
449; AVX512F-NEXT:    movq %rcx, %rax
450; AVX512F-NEXT:    shlq $32, %rax
451; AVX512F-NEXT:    orq %rcx, %rax
452; AVX512F-NEXT:    movq %rbp, %rsp
453; AVX512F-NEXT:    popq %rbp
454; AVX512F-NEXT:    retq
455;
456; VL_BW_DQ-LABEL: shuf64i1_zero:
457; VL_BW_DQ:       # BB#0:
458; VL_BW_DQ-NEXT:    kmovq %rdi, %k0
459; VL_BW_DQ-NEXT:    vpmovm2b %k0, %zmm0
460; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
461; VL_BW_DQ-NEXT:    vpsllw $7, %zmm0, %zmm0
462; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
463; VL_BW_DQ-NEXT:    kmovq %k0, %rax
464; VL_BW_DQ-NEXT:    retq
465  %b = bitcast i64 %a to <64 x i1>
466  %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
467  %d = bitcast <64 x i1> %c to i64
468  ret i64 %d
469}
470