; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
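
; These tests exercise 256-bit vector cttz lowering. Without CD instructions
; the expansion appears to follow the classic identity
; cttz(x) = ctpop((x & -x) - 1): negating and ANDing isolates the lowest set
; bit, subtracting 1 turns the trailing zeros into ones, and the population
; count is then taken per byte with a pshufb nibble lookup table
; ([0,1,1,2,...] gives the bit count of each 4-bit value) before being
; widened to the element size. A scalar IR sketch of the same identity
; (illustrative only, not part of the test):
;   %neg  = sub i64 0, %x
;   %lsb  = and i64 %x, %neg                    ; isolate lowest set bit
;   %mask = sub i64 %lsb, 1                     ; ones in the trailing-zero positions
;   %res  = call i64 @llvm.ctpop.i64(i64 %mask) ; cttz(%x); %x = 0 gives 64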

define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i64:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX512CD-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

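; With the zero-is-undef flag (i1 -1) the AVX512CD targets can use leading
; zero count instead: for nonzero x, cttz(x) = 63 - lzcnt(x & -x), so the
; vplzcntq result is subtracted from a broadcast constant (presumably 63; the
; constant itself is hidden behind the {{.*}}(%rip) pattern). The non-VL
; AVX512CD version widens to zmm registers to use the 512-bit vplzcntq.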
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv4i64u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vplzcntq %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv4i64u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

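; For i32 elements the per-byte counts from the nibble LUT still have to be
; summed into 32-bit lanes: the count bytes are unpacked against zero into
; qword-sized groups, vpsadbw horizontally sums the bytes of each group, and
; vpackuswb packs the partial sums back into the dword lanes.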
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv8i32:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT:    vpandd %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm3
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX512CD-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv8i32u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandd %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vplzcntd %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv8i32u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

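; For i16 elements the two byte counts of each word are combined by shifting
; the low count into the high byte (vpsllw $8), adding, and shifting the word
; sum back down (vpsrlw $8).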
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv16i16:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv16i16:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv16i16u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CDVL-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CDVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv16i16u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX512CD-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}

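; For i8 elements the nibble LUT already yields a per-byte count, so the
; vpaddb of the low- and high-nibble counts is the final result and no
; widening step is needed.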
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv32i8:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv32i8:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: testv32i8u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vpxord %ymm1, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm2
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDVL-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpandq %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CDVL-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: testv32i8u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

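; The remaining tests check that cttz of constant vectors is folded at
; compile time into a single load of the precomputed result from the
; constant pool.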
define <4 x i64> @foldv4i64() nounwind {
; AVX1-LABEL: foldv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv4i64:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i64:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @foldv4i64u() nounwind {
; AVX1-LABEL: foldv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv4i64u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv4i64u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX512CD-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; AVX1-LABEL: foldv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv8i32:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i32:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; AVX1-LABEL: foldv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv8i32u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa32 {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv8i32u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX512CD-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; AVX1-LABEL: foldv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv16i16:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i16:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; AVX1-LABEL: foldv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv16i16u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv16i16u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX512CD-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; AVX1-LABEL: foldv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv32i8:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv32i8:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; AVX1-LABEL: foldv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: foldv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX2-NEXT:    retq
;
; AVX512CDVL-LABEL: foldv32i8u:
; AVX512CDVL:       # BB#0:
; AVX512CDVL-NEXT:    vmovdqa64 {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CDVL-NEXT:    retq
;
; AVX512CD-LABEL: foldv32i8u:
; AVX512CD:       # BB#0:
; AVX512CD-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX512CD-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)