; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

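; The tests below exercise lowering of llvm.cttz for 256-bit vectors. The expected
; code uses the identity cttz(x) = popcount((x & -x) - 1): negating and ANDing
; isolates the lowest set bit, subtracting 1 turns the trailing zeros into ones,
; and the population count of that mask is the trailing zero count. Popcount is
; done with the vpshufb nibble lookup table, and vpsadbw against zero sums the
; per-byte counts into 64-bit lanes. The "u" variants pass the zero-is-undef flag
; (i1 -1). On AVX1 the integer work is split across 128-bit halves and recombined
; with vinsertf128; on AVX2 it stays in a single 256-bit register.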
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i64u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
  ret <4 x i64> %out
}

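; For i32 elements the byte-wise popcount has to be reduced into 32-bit lanes:
; vpsadbw only produces 64-bit sums, so the expected code interleaves the counts
; with zero (vpunpckldq/vpunpckhdq), runs vpsadbw on each half, and repacks the
; results with vpackuswb.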
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i32u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1)
  ret <8 x i32> %out
}

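; For i16 elements the per-byte counts are summed into 16-bit lanes without
; vpsadbw: shifting left by 8 and adding (vpsllw/vpaddb) accumulates each pair of
; adjacent byte counts into the high byte, and vpsrlw $8 moves the sum back down.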
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i16u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllw $8, %ymm0, %ymm1
; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1)
  ret <16 x i16> %out
}

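; For i8 elements the vpshufb nibble lookup already produces the final per-byte
; count, so no horizontal reduction step is needed after vpaddb.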
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv32i8u:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1)
  ret <32 x i8> %out
}

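; The fold* tests check that cttz of all-constant vectors is folded at compile
; time: the expected code is just a load of the precomputed result vector.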
define <4 x i64> @foldv4i64() nounwind {
; ALL-LABEL: foldv4i64:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; ALL-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
  ret <4 x i64> %out
}

define <4 x i64> @foldv4i64u() nounwind {
; ALL-LABEL: foldv4i64u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; ALL-NEXT:    retq
  %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
  ret <4 x i64> %out
}

define <8 x i32> @foldv8i32() nounwind {
; ALL-LABEL: foldv8i32:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; ALL-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
  ret <8 x i32> %out
}

define <8 x i32> @foldv8i32u() nounwind {
; ALL-LABEL: foldv8i32u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; ALL-NEXT:    retq
  %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
  ret <8 x i32> %out
}

define <16 x i16> @foldv16i16() nounwind {
; ALL-LABEL: foldv16i16:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; ALL-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
  ret <16 x i16> %out
}

define <16 x i16> @foldv16i16u() nounwind {
; ALL-LABEL: foldv16i16u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; ALL-NEXT:    retq
  %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
  ret <16 x i16> %out
}

define <32 x i8> @foldv32i8() nounwind {
; ALL-LABEL: foldv32i8:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; ALL-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
  ret <32 x i8> %out
}

define <32 x i8> @foldv32i8u() nounwind {
; ALL-LABEL: foldv32i8u:
; ALL:       # BB#0:
; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; ALL-NEXT:    retq
  %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
  ret <32 x i8> %out
}

declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1)