1 /*
2 * Copyright © 2020 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_builder.h"
26 #include "aco_ir.h"
27
28 #include <vector>
29
30 namespace aco {
31 namespace {
32
/* Categories used to decide which memory instructions may be grouped into one hard clause.
 * LDS and VALU clauses exist as well, but they are not modelled here because they were
 * not considered interesting.
 */
enum clause_type {
   /* SMEM instructions that have at least one operand (any GFX level). */
   clause_smem,
   /* Anything that fits no other category; never collected into a clause. */
   clause_other,

   /* GFX10: categories handed out by get_type() below GFX11. */
   clause_vmem, /* VMEM with operands, plus scratch and global accesses */
   clause_flat, /* flat accesses */

   /* GFX11: categories are split per instruction family and per access kind. */
   clause_mimg_load,
   clause_mimg_store,  /* MIMG without definitions */
   clause_mimg_atomic, /* image_atomic_* opcodes */
   clause_mimg_sample, /* MIMG with definitions and a defined s4 operand at index 1 */
   clause_vmem_load,
   clause_vmem_store,  /* MTBUF/MUBUF/scratch/global without definitions */
   clause_vmem_atomic, /* buffer_atomic_* and global_atomic_* opcodes */
   clause_flat_load,
   clause_flat_store,  /* flat without definitions */
   clause_flat_atomic, /* flat_atomic_* opcodes */
   clause_bvh,         /* image_bvh(64)_intersect_ray */
};
53
54 void
emit_clause(Builder & bld,unsigned num_instrs,aco_ptr<Instruction> * instrs)55 emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction>* instrs)
56 {
57 unsigned start = 0;
58 unsigned end = num_instrs;
59
60 if (bld.program->gfx_level < GFX11) {
61 /* skip any stores at the start */
62 for (; (start < num_instrs) && instrs[start]->definitions.empty(); start++)
63 bld.insert(std::move(instrs[start]));
64
65 for (end = start; (end < num_instrs) && !instrs[end]->definitions.empty(); end++)
66 ;
67 }
68
69 unsigned clause_size = end - start;
70 if (clause_size > 1)
71 bld.sopp(aco_opcode::s_clause, -1, clause_size - 1);
72
73 for (unsigned i = start; i < num_instrs; i++)
74 bld.insert(std::move(instrs[i]));
75 }
76
77 clause_type
get_type(Program * program,aco_ptr<Instruction> & instr)78 get_type(Program* program, aco_ptr<Instruction>& instr)
79 {
80 if (instr->isSMEM() && !instr->operands.empty())
81 return clause_smem;
82
83 if (program->gfx_level >= GFX11) {
84 if (instr->isMIMG()) {
85 switch (instr->opcode) {
86 case aco_opcode::image_bvh_intersect_ray:
87 case aco_opcode::image_bvh64_intersect_ray: return clause_bvh;
88 case aco_opcode::image_atomic_swap:
89 case aco_opcode::image_atomic_cmpswap:
90 case aco_opcode::image_atomic_add:
91 case aco_opcode::image_atomic_sub:
92 case aco_opcode::image_atomic_rsub:
93 case aco_opcode::image_atomic_smin:
94 case aco_opcode::image_atomic_umin:
95 case aco_opcode::image_atomic_smax:
96 case aco_opcode::image_atomic_umax:
97 case aco_opcode::image_atomic_and:
98 case aco_opcode::image_atomic_or:
99 case aco_opcode::image_atomic_xor:
100 case aco_opcode::image_atomic_inc:
101 case aco_opcode::image_atomic_dec:
102 case aco_opcode::image_atomic_fcmpswap:
103 case aco_opcode::image_atomic_fmin:
104 case aco_opcode::image_atomic_fmax: return clause_mimg_atomic;
105 default:
106 if (instr->definitions.empty())
107 return clause_mimg_store;
108 else
109 return !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4
110 ? clause_mimg_sample
111 : clause_mimg_load;
112 }
113 } else if (instr->isMTBUF() || instr->isScratch()) {
114 return instr->definitions.empty() ? clause_vmem_store : clause_vmem_load;
115 } else if (instr->isMUBUF()) {
116 switch (instr->opcode) {
117 case aco_opcode::buffer_atomic_add:
118 case aco_opcode::buffer_atomic_and_x2:
119 case aco_opcode::buffer_atomic_rsub:
120 case aco_opcode::buffer_atomic_umax:
121 case aco_opcode::buffer_atomic_dec:
122 case aco_opcode::buffer_atomic_smax:
123 case aco_opcode::buffer_atomic_fmax:
124 case aco_opcode::buffer_atomic_rsub_x2:
125 case aco_opcode::buffer_atomic_smin:
126 case aco_opcode::buffer_atomic_sub:
127 case aco_opcode::buffer_atomic_sub_x2:
128 case aco_opcode::buffer_atomic_xor_x2:
129 case aco_opcode::buffer_atomic_add_f32:
130 case aco_opcode::buffer_atomic_inc:
131 case aco_opcode::buffer_atomic_swap_x2:
132 case aco_opcode::buffer_atomic_cmpswap:
133 case aco_opcode::buffer_atomic_fmin_x2:
134 case aco_opcode::buffer_atomic_umin:
135 case aco_opcode::buffer_atomic_or:
136 case aco_opcode::buffer_atomic_umax_x2:
137 case aco_opcode::buffer_atomic_smin_x2:
138 case aco_opcode::buffer_atomic_umin_x2:
139 case aco_opcode::buffer_atomic_cmpswap_x2:
140 case aco_opcode::buffer_atomic_add_x2:
141 case aco_opcode::buffer_atomic_swap:
142 case aco_opcode::buffer_atomic_and:
143 case aco_opcode::buffer_atomic_fmin:
144 case aco_opcode::buffer_atomic_fcmpswap_x2:
145 case aco_opcode::buffer_atomic_or_x2:
146 case aco_opcode::buffer_atomic_fcmpswap:
147 case aco_opcode::buffer_atomic_xor:
148 case aco_opcode::buffer_atomic_dec_x2:
149 case aco_opcode::buffer_atomic_fmax_x2:
150 case aco_opcode::buffer_atomic_csub:
151 case aco_opcode::buffer_atomic_inc_x2:
152 case aco_opcode::buffer_atomic_smax_x2: return clause_vmem_atomic;
153 default: return instr->definitions.empty() ? clause_vmem_store : clause_vmem_load;
154 }
155 } else if (instr->isGlobal()) {
156 switch (instr->opcode) {
157 case aco_opcode::global_atomic_swap:
158 case aco_opcode::global_atomic_umax:
159 case aco_opcode::global_atomic_cmpswap:
160 case aco_opcode::global_atomic_and_x2:
161 case aco_opcode::global_atomic_fmax:
162 case aco_opcode::global_atomic_smax_x2:
163 case aco_opcode::global_atomic_fmax_x2:
164 case aco_opcode::global_atomic_dec:
165 case aco_opcode::global_atomic_dec_x2:
166 case aco_opcode::global_atomic_umin:
167 case aco_opcode::global_atomic_fcmpswap_x2:
168 case aco_opcode::global_atomic_inc:
169 case aco_opcode::global_atomic_and:
170 case aco_opcode::global_atomic_fmin:
171 case aco_opcode::global_atomic_fcmpswap:
172 case aco_opcode::global_atomic_or_x2:
173 case aco_opcode::global_atomic_smax:
174 case aco_opcode::global_atomic_sub:
175 case aco_opcode::global_atomic_xor:
176 case aco_opcode::global_atomic_swap_x2:
177 case aco_opcode::global_atomic_umax_x2:
178 case aco_opcode::global_atomic_umin_x2:
179 case aco_opcode::global_atomic_xor_x2:
180 case aco_opcode::global_atomic_inc_x2:
181 case aco_opcode::global_atomic_fmin_x2:
182 case aco_opcode::global_atomic_add_f32:
183 case aco_opcode::global_atomic_add:
184 case aco_opcode::global_atomic_or:
185 case aco_opcode::global_atomic_add_x2:
186 case aco_opcode::global_atomic_smin_x2:
187 case aco_opcode::global_atomic_smin:
188 case aco_opcode::global_atomic_csub:
189 case aco_opcode::global_atomic_sub_x2:
190 case aco_opcode::global_atomic_cmpswap_x2: return clause_vmem_atomic;
191 default: return instr->definitions.empty() ? clause_vmem_store : clause_vmem_load;
192 }
193 } else if (instr->isFlat()) {
194 switch (instr->opcode) {
195 case aco_opcode::flat_atomic_smax:
196 case aco_opcode::flat_atomic_fcmpswap_x2:
197 case aco_opcode::flat_atomic_inc_x2:
198 case aco_opcode::flat_atomic_dec:
199 case aco_opcode::flat_atomic_fmin:
200 case aco_opcode::flat_atomic_umax_x2:
201 case aco_opcode::flat_atomic_add_f32:
202 case aco_opcode::flat_atomic_or:
203 case aco_opcode::flat_atomic_smax_x2:
204 case aco_opcode::flat_atomic_umin:
205 case aco_opcode::flat_atomic_sub:
206 case aco_opcode::flat_atomic_swap:
207 case aco_opcode::flat_atomic_swap_x2:
208 case aco_opcode::flat_atomic_cmpswap_x2:
209 case aco_opcode::flat_atomic_fcmpswap:
210 case aco_opcode::flat_atomic_add:
211 case aco_opcode::flat_atomic_umin_x2:
212 case aco_opcode::flat_atomic_xor_x2:
213 case aco_opcode::flat_atomic_smin:
214 case aco_opcode::flat_atomic_fmax_x2:
215 case aco_opcode::flat_atomic_cmpswap:
216 case aco_opcode::flat_atomic_dec_x2:
217 case aco_opcode::flat_atomic_sub_x2:
218 case aco_opcode::flat_atomic_add_x2:
219 case aco_opcode::flat_atomic_umax:
220 case aco_opcode::flat_atomic_xor:
221 case aco_opcode::flat_atomic_and_x2:
222 case aco_opcode::flat_atomic_inc:
223 case aco_opcode::flat_atomic_and:
224 case aco_opcode::flat_atomic_fmin_x2:
225 case aco_opcode::flat_atomic_smin_x2:
226 case aco_opcode::flat_atomic_or_x2:
227 case aco_opcode::flat_atomic_fmax: return clause_flat_atomic;
228 default: return instr->definitions.empty() ? clause_flat_store : clause_flat_load;
229 }
230 }
231 } else {
232 if (instr->isVMEM() && !instr->operands.empty()) {
233 if (program->gfx_level == GFX10 && instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
234 return clause_other;
235 else
236 return clause_vmem;
237 } else if (instr->isScratch() || instr->isGlobal()) {
238 return clause_vmem;
239 } else if (instr->isFlat()) {
240 return clause_flat;
241 }
242 }
243 return clause_other;
244 }
245
246 } /* end namespace */
247
248 void
form_hard_clauses(Program * program)249 form_hard_clauses(Program* program)
250 {
251 /* The ISA documentation says 63 is the maximum for GFX11/12, but according to
252 * LLVM there are HW bugs with more than 32 instructions.
253 */
254 const unsigned max_clause_length = program->gfx_level >= GFX11 ? 32 : 63;
255 for (Block& block : program->blocks) {
256 unsigned num_instrs = 0;
257 aco_ptr<Instruction> current_instrs[63];
258 clause_type current_type = clause_other;
259
260 std::vector<aco_ptr<Instruction>> new_instructions;
261 new_instructions.reserve(block.instructions.size());
262 Builder bld(program, &new_instructions);
263
264 for (unsigned i = 0; i < block.instructions.size(); i++) {
265 aco_ptr<Instruction>& instr = block.instructions[i];
266
267 clause_type type = get_type(program, instr);
268 if (type != current_type || num_instrs == max_clause_length ||
269 (num_instrs && !should_form_clause(current_instrs[0].get(), instr.get()))) {
270 emit_clause(bld, num_instrs, current_instrs);
271 num_instrs = 0;
272 current_type = type;
273 }
274
275 if (type == clause_other) {
276 bld.insert(std::move(instr));
277 continue;
278 }
279
280 current_instrs[num_instrs++] = std::move(instr);
281 }
282
283 emit_clause(bld, num_instrs, current_instrs);
284
285 block.instructions = std::move(new_instructions);
286 }
287 }
288 } // namespace aco
289