; Code generation tests for amdgcn control flow whose branch conditions
; depend on the workitem ID: switch/if-else, early returns, and loops.
; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s

; Workitem-ID intrinsic (X dimension, per its name). Every kernel in this
; file calls it to obtain a value that differs between workitems.
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; test_if: a three-way switch on the workitem ID whose default arm contains a
; nested if/else. Every arm stores a different constant to %dst[%b].
;
; Make sure the i1 values created by the CFG structurizer pass are
; moved using VALU instructions.
; SI-LABEL: {{^}}test_if:


; The exec mask saved by s_and_saveexec is xor'ed with exec; the result is
; the mask consumed by s_or_saveexec in the flow block further down.
; NOTE(review): the original comment here read "waitcnt should be inserted
; after exec modification", but no s_waitcnt is checked in this sequence.
; SI: v_cmp_lt_i32_e32 vcc, 0,
; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
; SI-NEXT: s_cbranch_execz [[FLOW_BB]]

; The -1 must be materialized with a VALU move, not a scalar 64-bit move.
; Register numbers are matched with one-or-more digits so the negative check
; also catches multi-digit register pairs.
; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
; SI-NOT: s_mov_b64 s[{{[0-9]+:[0-9]+}}], -1
; SI: v_mov_b32_e32 v{{[0-9]+}}, -1
; SI: s_and_saveexec_b64
; SI-NEXT: ; mask branch

; v_mov should be after exec modification
; SI: [[FLOW_BB]]:
; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
; SI-NEXT: ; mask branch
;
define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  switch i32 %tid, label %default [
    i32 0, label %case0
    i32 1, label %case1
  ]

case0:
  %arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 13, i32 addrspace(1)* %arrayidx1, align 4
  br label %end

case1:
  %arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  store i32 17, i32 addrspace(1)* %arrayidx5, align 4
  br label %end

; Workitem IDs other than 0 and 1 land here and are split once more on ID 2.
default:
  %cmp8 = icmp eq i32 %tid, 2
  %arrayidx10 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
  br i1 %cmp8, label %if, label %else

if:
  store i32 19, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

else:
  store i32 21, i32 addrspace(1)* %arrayidx10, align 4
  br label %end

end:
  ret void
}

; simple_test_v_if: workitems with a non-zero ID store 999 to %dst[%tid];
; both paths then join in a single return block. %src is unused.
; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]

; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  br label %exit

exit:
  ret void
}

; simple_test_v_if_ret_else_ret: same store as simple_test_v_if, but the
; %then block and the %exit block each end in their own `ret void`.
;
; FIXME: It would be better to endpgm in the then block.

; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]

; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword

; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  ret void
}

; simple_test_v_if_ret_else_code_ret: both successors of the branch return,
; and the %exit block also performs a volatile store before its return.
;
; The final block has more than a ret to execute. This was miscompiled
; before function exit blocks were unified, since the s_endpgm would
; terminate the wavefront before the 'then' lanes reached the store.
;
; Note that the checked compare is `eq` although the IR compares `ne`: the
; %exit block is the one emitted directly after the mask branch.

; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]

; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
; SI: ds_write_b32

; SI-NEXT: {{^}}[[FLOW]]:
; SI-NEXT: s_or_saveexec_b64
; SI-NEXT: s_xor_b64 exec, exec
; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]

; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
; SI: s_waitcnt
; SI-NEXT: buffer_store_dword

; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %is.0 = icmp ne i32 %tid, 0
  br i1 %is.0, label %then, label %exit

then:
  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
  store i32 999, i32 addrspace(1)* %gep
  ret void

exit:
  ; Volatile store through an undef addrspace(3) pointer; it is the
  ; ds_write_b32 checked above.
  store volatile i32 7, i32 addrspace(3)* undef
  ret void
}

; simple_test_v_loop: workitems with a non-zero ID run a loop with 64
; iterations (%i counts from %tid up to %tid + 64), storing a loaded value
; to %dst[%i] on each iteration; workitem 0 skips the loop entirely.
; SI-LABEL: {{^}}simple_test_v_loop:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}

; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword
; SI-DAG: buffer_store_dword
; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100
; SI: s_cbranch_vccz [[LABEL_LOOP]]
; SI: [[LABEL_EXIT]]:
; SI: s_endpgm

define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %is.0 = icmp ne i32 %tid, 0
  %limit = add i32 %tid, 64
  br i1 %is.0, label %loop, label %exit

loop:
  %i = phi i32 [%tid, %entry], [%i.inc, %loop]
  ; NOTE(review): %gep.src is computed but never used; the load below reads
  ; %src itself on every iteration. Left unchanged because the checks above
  ; pin the code generated for the IR exactly as written.
  %gep.src = getelementptr i32, i32 addrspace(1)* %src, i32 %i
  %gep.dst = getelementptr i32, i32 addrspace(1)* %dst, i32 %i
  %load = load i32, i32 addrspace(1)* %src
  store i32 %load, i32 addrspace(1)* %gep.dst
  %i.inc = add nsw i32 %i, 1
  %cmp = icmp eq i32 %limit, %i.inc
  br i1 %cmp, label %exit, label %loop

exit:
  ret void
}

; multi_vcond_loop: a loop with two per-workitem exit conditions. Each
; workitem loads a bound from %arg3[tid]; while the counter is below that
; bound it loads %arg1[i] and %arg2[i], leaves the loop if either value is
; -1, and otherwise stores their sum to %arg[i].
; SI-LABEL: {{^}}multi_vcond_loop:

; Load loop limit from buffer
; Branch to exit if uniformly not taken
; SI: ; %bb.0:
; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
; SI: v_cmp_lt_i32_e32 vcc
; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI-NEXT: ; mask branch
; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

; Initialize inner condition to false
; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]

; Clear exec bits for workitems that load -1s
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
; SI: buffer_load_dword [[B:v[0-9]+]]
; SI: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
; SI: s_xor_b64 [[ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG2]]
; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]

; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
; SI: buffer_store_dword
; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]

; The flow block merges both exit conditions into COND_STATE and removes the
; finished workitems from exec before branching back to the loop header.
; SI: [[LABEL_FLOW]]:
; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]]
; SI-NEXT: s_mov_b64 [[MOVED_TMP:s\[[0-9]+:[0-9]+\]]], [[TMP]]
; SI-NEXT: s_and_b64 [[MASKED_ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG3]]
; SI-NEXT: s_or_b64 [[COND_STATE]], [[MASKED_ORNEG3]], [[MOVED_TMP]]
; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

; SI: [[LABEL_EXIT]]:
; SI-NOT: [[COND_STATE]]
; SI: s_endpgm

define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tmp4 = sext i32 %tmp to i64
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg3, i64 %tmp4
  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 0
  %tmp8 = sext i32 %tmp6 to i64
  br i1 %tmp7, label %bb10, label %bb26

bb10:                                             ; preds = %bb, %bb20
  %tmp11 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb ]
  %tmp12 = add nsw i64 %tmp11, %tmp4
  %tmp13 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp12
  %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp12
  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
  %tmp17 = icmp ne i32 %tmp14, -1
  %tmp18 = icmp ne i32 %tmp16, -1
  %tmp19 = and i1 %tmp17, %tmp18
  br i1 %tmp19, label %bb20, label %bb26

bb20:                                             ; preds = %bb10
  %tmp21 = add nsw i32 %tmp16, %tmp14
  %tmp22 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp12
  store i32 %tmp21, i32 addrspace(1)* %tmp22, align 4
  %tmp23 = add nuw nsw i64 %tmp11, 1
  %tmp24 = icmp slt i64 %tmp23, %tmp8
  br i1 %tmp24, label %bb10, label %bb26

bb26:                                             ; preds = %bb10, %bb20, %bb
  ret void
}

; Attribute groups: #0 is attached to the workitem-ID call in
; multi_vcond_loop, #1 to every kernel definition in this file.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }