; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG

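; testv2i64: with plain SSE2/SSE3 there is no byte shuffle, so llc emits the
; classic SWAR popcount over each 64-bit lane:
;   v = v - ((v >> 1) & 0x5555...);
;   v = (v & 0x3333...) + ((v >> 2) & 0x3333...);
;   v = (v + (v >> 4)) & 0x0F0F...;
; then psadbw against zero sums the per-byte counts within each quadword.
; The {{.*}}(%rip) operands are elided constant-pool loads (by the algorithm,
; the 0x55.. and 0x0F.. masks); 3689348814741910323 is 0x3333333333333333.
; From SSSE3 on, pshufb indexes the table [0,1,1,2,...,4] (the popcounts of
; the nibbles 0-15) with the low and high nibble of every byte and adds the
; two results. AVX512VPOPCNTDQ lowers directly to vpopcntq, widening to zmm
; when the VL extension is unavailable.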
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubq %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlq $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $4, %xmm1
; SSE2-NEXT:    paddq %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlq $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubq %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlq $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddq %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlq $4, %xmm1
; SSE3-NEXT:    paddq %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    psadbw %xmm3, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    psadbw %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv2i64:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv2i64:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv2i64:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; BITALG_NOVLX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv2i64:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm2
; BITALG-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; BITALG-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
  ret <2 x i64> %out
}

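; testv4i32: same per-byte counting, but psadbw only yields one sum per
; 64-bit lane, so the dword lowerings first interleave the byte counts with
; zero (punpckldq/punpckhdq, or pmovzxdq on SSE4.1+) to give each 32-bit
; half its own quadword, run psadbw on both halves, and packuswb the results
; back into one register. 858993459 is 0x33333333.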
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubd %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrld $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    psadbw %xmm0, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    psadbw %xmm0, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrld $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubd %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrld $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrld $4, %xmm1
; SSE3-NEXT:    paddd %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    movdqa %xmm1, %xmm2
; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    psadbw %xmm0, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    psadbw %xmm0, %xmm1
; SSE3-NEXT:    packuswb %xmm2, %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT:    psadbw %xmm0, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    psadbw %xmm0, %xmm1
; SSSE3-NEXT:    packuswb %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE41-NEXT:    psadbw %xmm1, %xmm3
; SSE41-NEXT:    psadbw %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i32:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv4i32:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv4i32:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm2
; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG_NOVLX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; BITALG_NOVLX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv4i32:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm2
; BITALG-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; BITALG-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; BITALG-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
; BITALG-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; BITALG-NEXT:    retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
  ret <4 x i32> %out
}

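; testv8i16: the byte counts are combined into word counts arithmetically:
; psllw $8 plus paddb adds each low byte's count into its high byte, and
; psrlw $8 moves the total back down. 13107 is 0x3333. BITALG supplies a
; native vpopcntw; AVX512VPOPCNTDQ instead zero-extends the words to dwords,
; uses vpopcntd, and truncates back with vpmovdw.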
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubw %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddw %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddw %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    psllw $8, %xmm0
; SSE3-NEXT:    paddb %xmm1, %xmm0
; SSE3-NEXT:    psrlw $8, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    pshufb %xmm2, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    paddb %xmm4, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    psllw $8, %xmm0
; SSSE3-NEXT:    paddb %xmm3, %xmm0
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pand %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    pshufb %xmm2, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    psllw $8, %xmm0
; SSE41-NEXT:    paddb %xmm3, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i16:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv8i16:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv8i16:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv8i16:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
; BITALG-NEXT:    retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
  ret <8 x i16> %out
}

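; testv16i8: byte elements need no widening or recombination step, so the
; SWAR path ends at the final 0x0F.. mask and the pshufb path ends at paddb.
; 51 is 0x33. Without AVX512VL, the VPOPCNTDQ lowering zero-extends all
; sixteen bytes to dwords in a zmm register and truncates back with vpmovdb.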
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: testv16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $1, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    psubb %xmm1, %xmm0
; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pand %xmm1, %xmm2
; SSE3-NEXT:    psrlw $2, %xmm0
; SSE3-NEXT:    pand %xmm1, %xmm0
; SSE3-NEXT:    paddb %xmm2, %xmm0
; SSE3-NEXT:    movdqa %xmm0, %xmm1
; SSE3-NEXT:    psrlw $4, %xmm1
; SSE3-NEXT:    paddb %xmm0, %xmm1
; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE3-NEXT:    movdqa %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: testv16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    pand %xmm2, %xmm3
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    paddb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: testv16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pand %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    pshufb %xmm3, %xmm4
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pshufb %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm4, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: testv16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: testv16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i8:
; AVX512VPOPCNTDQ:       # %bb.0:
; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT:    vzeroupper
; AVX512VPOPCNTDQ-NEXT:    retq
;
; AVX512VPOPCNTDQVL-LABEL: testv16i8:
; AVX512VPOPCNTDQVL:       # %bb.0:
; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VPOPCNTDQVL-NEXT:    vzeroupper
; AVX512VPOPCNTDQVL-NEXT:    retq
;
; BITALG_NOVLX-LABEL: testv16i8:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; BITALG_NOVLX-NEXT:    vzeroupper
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: testv16i8:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
; BITALG-NEXT:    retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
  ret <16 x i8> %out
}

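; The foldv* tests below call ctpop on constant operands; every prefix
; expects a single constant load, verifying that the intrinsic is constant
; folded at compile time (e.g. popcount(256) = 1 and popcount(-1 as i64) = 64
; give the [1,64] expected for foldv2i64).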
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,64]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv2i64:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv2i64:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [1,64]
; BITALG-NEXT:    retq
  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
  ret <2 x i64> %out
}

define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,32,0,8]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv4i32:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv4i32:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; BITALG-NEXT:    retq
  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
  ret <4 x i32> %out
}

define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv8i16:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv8i16:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; BITALG-NEXT:    retq
  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
  ret <8 x i16> %out
}

define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: foldv16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; AVX-NEXT:    retq
;
; BITALG_NOVLX-LABEL: foldv16i8:
; BITALG_NOVLX:       # %bb.0:
; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; BITALG_NOVLX-NEXT:    retq
;
; BITALG-LABEL: foldv16i8:
; BITALG:       # %bb.0:
; BITALG-NEXT:    vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; BITALG-NEXT:    retq
  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
  ret <16 x i8> %out
}

declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)