; Tests legalization of the llvm.ctpop (population count) intrinsic for i16
; and i16 vectors on AMDGPU (SI/VI) and R600 (Evergreen) targets.
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s

; Scalar and vector 16-bit population-count intrinsics under test.
declare i16 @llvm.ctpop.i16(i16) nounwind readnone
declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) nounwind readnone
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) nounwind readnone

; Used to form per-lane (divergent) addresses in the v_* tests below.
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; ctpop of a uniform (SGPR) value should select the scalar s_bcnt1_i32_b32.
; FUNC-LABEL: {{^}}s_ctpop_i16:
; GCN: s_load_dword [[SVAL:s[0-9]+]],
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_short [[VRESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @s_ctpop_i16(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  store i16 %ctpop, i16 addrspace(1)* %out, align 4
  ret void
}

; ctpop of a divergent (VGPR) value selects v_bcnt_u32_b32 with 0 add operand.
; XXX - Why 0 in register?
; FUNC-LABEL: {{^}}v_ctpop_i16:
; GCN: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  store i16 %ctpop, i16 addrspace(1)* %out, align 4
  ret void
}

; Two ctpops feeding an add should chain: the second v_bcnt consumes the
; first result as its add operand instead of emitting a separate add.
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i16:
; SI: buffer_load_ushort [[VAL0:v[0-9]+]],
; SI: buffer_load_ushort [[VAL1:v[0-9]+]],
; VI: flat_load_ushort [[VAL0:v[0-9]+]],
; VI: flat_load_ushort [[VAL1:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_add_chain_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in0, i16 addrspace(1)* noalias %in1) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i16, i16 addrspace(1)* %in0, i32 %tid
  %in1.gep = getelementptr i16, i16 addrspace(1)* %in1, i32 %tid
  %val0 = load volatile i16, i16 addrspace(1)* %in0.gep, align 4
  %val1 = load volatile i16, i16 addrspace(1)* %in1.gep, align 4
  %ctpop0 = call i16 @llvm.ctpop.i16(i16 %val0) nounwind readnone
  %ctpop1 = call i16 @llvm.ctpop.i16(i16 %val1) nounwind readnone
  %add = add i16 %ctpop0, %ctpop1
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; Adding a uniform scalar to a ctpop result should fold the SGPR directly
; into v_bcnt's add operand.
; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i16:
; GCN: {{buffer|flat}}_load_ushort [[VAL0:v[0-9]+]],
; GCN: s_waitcnt
; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_add_sgpr_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %sval) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  %add = add i16 %ctpop, %sval
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; Vector ctpop scalarizes to one v_bcnt per element.
; FUNC-LABEL: {{^}}v_ctpop_v2i16:
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v2i16(<2 x i16> addrspace(1)* noalias %out, <2 x i16> addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
  %val = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep, align 8
  %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %val) nounwind readnone
  store <2 x i16> %ctpop, <2 x i16> addrspace(1)* %out, align 8
  ret void
}

; Vector ctpop scalarizes to one v_bcnt per element.
; FUNC-LABEL: {{^}}v_ctpop_v4i16:
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <4 x i16> addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
  %val = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep, align 16
  %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %val) nounwind readnone
  store <4 x i16> %ctpop, <4 x i16> addrspace(1)* %out, align 16
  ret void
}

; Vector ctpop scalarizes to one v_bcnt per element.
; FUNC-LABEL: {{^}}v_ctpop_v8i16:
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> addrspace(1)* noalias %out, <8 x i16> addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <8 x i16>, <8 x i16> addrspace(1)* %in, i32 %tid
  %val = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep, align 32
  %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val) nounwind readnone
  store <8 x i16> %ctpop, <8 x i16> addrspace(1)* %out, align 32
  ret void
}

; Vector ctpop scalarizes to one v_bcnt per element.
; FUNC-LABEL: {{^}}v_ctpop_v16i16:
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: v_bcnt_u32_b32{{(_e64)*}}
; GCN: s_endpgm

; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out, <16 x i16> addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <16 x i16>, <16 x i16> addrspace(1)* %in, i32 %tid
  %val = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep, align 32
  %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %val) nounwind readnone
  store <16 x i16> %ctpop, <16 x i16> addrspace(1)* %out, align 32
  ret void
}

; An inline-immediate addend (4) folds into v_bcnt's add operand.
; FUNC-LABEL: {{^}}v_ctpop_i16_add_inline_constant:
; GCN: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  %add = add i16 %ctpop, 4
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; Same as above with commuted add operands; must produce identical codegen.
; FUNC-LABEL: {{^}}v_ctpop_i16_add_inline_constant_inv:
; GCN: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  %add = add i16 4, %ctpop
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; A non-inline literal addend (999 = 0x3e7) is materialized in an SGPR and
; folded into v_bcnt's add operand.
; FUNC-LABEL: {{^}}v_ctpop_i16_add_literal:
; GCN-DAG: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
; SI-DAG: s_movk_i32 [[LIT:s[0-9]+]], 0x3e7
; VI-DAG: s_movk_i32 [[LIT:s[0-9]+]], 0x3e7
; SI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  %add = add i16 %ctpop, 999
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; A uniform kernel-argument addend is loaded into an SGPR and folded into
; v_bcnt's add operand.
; FUNC-LABEL: {{^}}v_ctpop_i16_add_var:
; GCN-DAG: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i16_add_var(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  %add = add i16 %ctpop, %const
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; Same as above with commuted add operands; must produce identical codegen.
; FUNC-LABEL: {{^}}v_ctpop_i16_add_var_inv:
; GCN-DAG: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  %add = add i16 %const, %ctpop
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; A divergent (VGPR) addend loaded from memory folds into v_bcnt's add operand.
; FUNC-LABEL: {{^}}v_ctpop_i16_add_vvar_inv:
; SI: buffer_load_ushort [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; SI: buffer_load_ushort [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
; VI: flat_load_ushort [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: flat_load_ushort [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm

; EG: BCNT_INT
define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 addrspace(1)* noalias %constptr) nounwind {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
  %val = load i16, i16 addrspace(1)* %in.gep, align 4
  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
  %gep = getelementptr i16, i16 addrspace(1)* %constptr, i32 %tid
  %const = load i16, i16 addrspace(1)* %gep, align 4
  %add = add i16 %const, %ctpop
  store i16 %add, i16 addrspace(1)* %out, align 4
  ret void
}

; FIXME: We currently disallow SALU instructions in all branches,
; but there are some cases when they should be allowed.

; FUNC-LABEL: {{^}}ctpop_i16_in_br:
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34

; GCN: s_and_b32 [[CTPOP_ARG:s[0-9]+]], [[VAL]], 0xffff
; GCN: s_bcnt1_i32_b32  [[SRESULT:s[0-9]+]], [[CTPOP_ARG]]
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @ctpop_i16_in_br(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %ctpop_arg, i16 %cond) {
entry:
  %tmp0 = icmp eq i16 %cond, 0
  br i1 %tmp0, label %if, label %else

if:
  %tmp2 = call i16 @llvm.ctpop.i16(i16 %ctpop_arg)
  br label %endif

else:
  %tmp3 = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %tmp4 = load i16, i16 addrspace(1)* %tmp3
  br label %endif

endif:
  %tmp5 = phi i16 [%tmp2, %if], [%tmp4, %else]
  store i16 %tmp5, i16 addrspace(1)* %out
  ret void
}