1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_builder.h"
26 #include "aco_ir.h"
27
28 #include <algorithm>
29 #include <map>
30 #include <unordered_map>
31 #include <vector>
32
/*
 * Implements an algorithm to lower to Conventional SSA Form (CSSA).
 * After "Revisiting Out-of-SSA Translation for Correctness, Code Quality, and Efficiency"
 * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,
 *
 * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
 * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
 * The algorithm coalesces non-interfering phi-resources while taking value-equality
 * into account. Re-indexes the SSA-defs.
 */
43
44 namespace aco {
45 namespace {
46
47 typedef std::vector<Temp> merge_set;
48
/* a (virtual) parallelcopy: writes 'op' into the new temporary 'def',
 * placed on the edge from a phi's predecessor block */
struct copy {
   Definition def;
   Operand op;
};
53
/* per-temporary bookkeeping for the coalescing algorithm;
 * -1u / Temp() sentinels mean "not set" */
struct merge_node {
   Operand value = Operand(); /* original value: can be an SSA-def or constant value */
   uint32_t index = -1u;      /* index into the vector of merge sets */
   uint32_t defined_at = -1u; /* defining block */

   /* we also remember two dominating defs with the same value: */
   Temp equal_anc_in = Temp();  /* within the same merge set */
   Temp equal_anc_out = Temp(); /* from a different set */
};
63
/* shared state of the CSSA-lowering pass */
struct cssa_ctx {
   Program* program;
   std::vector<IDSet>& live_out;                              /* live-out sets per block */
   std::vector<std::vector<copy>> parallelcopies;             /* copies per block */
   std::vector<merge_set> merge_sets;                         /* each vector is one (ordered) merge set */
   std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */
};
71
/* create (virtual) parallelcopies for each phi instruction and
 * already merge copy-definitions with phi-defs into merge sets.
 * The merge sets are kept ordered by dominance (dominators first). */
void
collect_parallelcopies(cssa_ctx& ctx)
{
   ctx.parallelcopies.resize(ctx.program->blocks.size());
   Builder bld(ctx.program);
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         /* phis are grouped at the top of a block: stop at the first non-phi */
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
            break;

         const Definition& def = phi->definitions[0];

         /* if the definition is not temp, it is the exec mask.
          * We can reload the exec mask directly from the spill slot.
          */
         if (!def.isTemp())
            continue;

         std::vector<unsigned>& preds =
            phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
         uint32_t index = ctx.merge_sets.size();
         merge_set set;

         bool has_preheader_copy = false;
         for (unsigned i = 0; i < phi->operands.size(); i++) {
            Operand op = phi->operands[i];
            if (op.isUndefined())
               continue;

            if (def.regClass().type() == RegType::sgpr && !op.isTemp()) {
               /* SGPR inline constants and literals on GFX10+ can be spilled
                * and reloaded directly (without intermediate register) */
               if (op.isConstant()) {
                  if (ctx.program->chip_class >= GFX10)
                     continue;
                  if (op.size() == 1 && !op.isLiteral())
                     continue;
               } else {
                  /* non-temp, non-constant phi operand: must be the exec mask */
                  assert(op.isFixed() && op.physReg() == exec);
                  continue;
               }
            }

            /* create new temporary and rename operands */
            Temp tmp = bld.tmp(def.regClass());
            ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op});
            phi->operands[i] = Operand(tmp);

            /* place the new operands in the same merge set */
            set.emplace_back(tmp);
            /* the copy's new def is defined at the end of the predecessor */
            ctx.merge_node_table[tmp.id()] = {op, index, preds[i]};

            /* update the liveness information */
            if (op.isKill())
               ctx.live_out[preds[i]].erase(op.tempId());
            ctx.live_out[preds[i]].insert(tmp.id());

            /* the first operand of a loop-header phi comes from the preheader,
             * which dominates the loop-header block itself */
            has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header;
         }

         if (set.empty())
            continue;

         /* place the definition in dominance-order */
         if (def.isTemp()) {
            if (has_preheader_copy)
               set.emplace(std::next(set.begin()), def.getTemp());
            else if (block.kind & block_kind_loop_header)
               set.emplace(set.begin(), def.getTemp());
            else
               set.emplace_back(def.getTemp());
            ctx.merge_node_table[def.tempId()] = {Operand(def.getTemp()), index, block.index};
         }
         ctx.merge_sets.emplace_back(set);
      }
   }
}
151
152 /* check whether the definition of a comes after b. */
153 inline bool
defined_after(cssa_ctx & ctx,Temp a,Temp b)154 defined_after(cssa_ctx& ctx, Temp a, Temp b)
155 {
156 merge_node& node_a = ctx.merge_node_table[a.id()];
157 merge_node& node_b = ctx.merge_node_table[b.id()];
158 if (node_a.defined_at == node_b.defined_at)
159 return a.id() > b.id();
160
161 return node_a.defined_at > node_b.defined_at;
162 }
163
164 /* check whether a dominates b where b is defined after a */
165 inline bool
dominates(cssa_ctx & ctx,Temp a,Temp b)166 dominates(cssa_ctx& ctx, Temp a, Temp b)
167 {
168 assert(defined_after(ctx, b, a));
169 merge_node& node_a = ctx.merge_node_table[a.id()];
170 merge_node& node_b = ctx.merge_node_table[b.id()];
171 unsigned idom = node_b.defined_at;
172 while (idom > node_a.defined_at)
173 idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom
174 : ctx.program->blocks[idom].linear_idom;
175
176 return idom == node_a.defined_at;
177 }
178
/* check intersection between var and parent:
 * i.e. whether their live ranges overlap anywhere.
 * We already know that parent dominates var. */
inline bool
intersects(cssa_ctx& ctx, Temp var, Temp parent)
{
   merge_node& node_var = ctx.merge_node_table[var.id()];
   merge_node& node_parent = ctx.merge_node_table[parent.id()];
   assert(node_var.index != node_parent.index);
   uint32_t block_idx = node_var.defined_at;

   /* if the parent is live-out at the definition block of var, they intersect */
   bool parent_live = ctx.live_out[block_idx].count(parent.id());
   if (parent_live)
      return true;

   /* parent is defined in a different block than var */
   if (node_parent.defined_at < node_var.defined_at) {
      /* if the parent is not live-in, they don't interfere */
      std::vector<uint32_t>& preds = var.type() == RegType::vgpr
                                        ? ctx.program->blocks[block_idx].logical_preds
                                        : ctx.program->blocks[block_idx].linear_preds;
      for (uint32_t pred : preds) {
         if (!ctx.live_out[pred].count(parent.id()))
            return false;
      }
   }

   /* scan the virtual parallelcopies on the incoming edges of var's block */
   for (const copy& cp : ctx.parallelcopies[block_idx]) {
      /* if var is defined at the edge, they don't intersect */
      if (cp.def.getTemp() == var)
         return false;
      /* parent is used (kept alive) by a copy on the edge */
      if (cp.op.isTemp() && cp.op.getTemp() == parent)
         parent_live = true;
   }
   /* if the parent is live at the edge, they intersect */
   if (parent_live)
      return true;

   /* both, parent and var, are present in the same block:
    * walk the block backwards to see which one ends first */
   const Block& block = ctx.program->blocks[block_idx];
   for (auto it = block.instructions.crbegin(); it != block.instructions.crend(); ++it) {
      /* if the parent was not encountered yet, it can only be used by a phi */
      if (is_phi(it->get()))
         break;

      for (const Definition& def : (*it)->definitions) {
         if (!def.isTemp())
            continue;
         /* if parent was not found yet, they don't intersect */
         if (def.getTemp() == var)
            return false;
      }

      for (const Operand& op : (*it)->operands) {
         if (!op.isTemp())
            continue;
         /* if the var was defined before this point, they intersect */
         if (op.getTemp() == parent)
            return true;
      }
   }

   return false;
}
243
/* check interference between var and parent:
 * i.e. they have different values and intersect.
 * If parent and var share the same value, also updates the equal ancestor.
 * This is the value-based interference test from Boissinot et al. */
inline bool
interference(cssa_ctx& ctx, Temp var, Temp parent)
{
   assert(var != parent);
   merge_node& node_var = ctx.merge_node_table[var.id()];
   node_var.equal_anc_out = Temp();

   if (node_var.index == ctx.merge_node_table[parent.id()].index) {
      /* same merge set: check/update against parent's ancestor in the other set */
      parent = ctx.merge_node_table[parent.id()].equal_anc_out;
   }

   Temp tmp = parent;
   /* check if var intersects with parent or any equal-valued ancestor */
   while (tmp != Temp() && !intersects(ctx, var, tmp)) {
      merge_node& node_tmp = ctx.merge_node_table[tmp.id()];
      tmp = node_tmp.equal_anc_in;
   }

   /* no intersection found */
   if (tmp == Temp())
      return false;

   /* var and parent, same value, but in different sets:
    * remember the intersecting equal-valued ancestor, no interference */
   if (node_var.value == ctx.merge_node_table[parent.id()].value) {
      node_var.equal_anc_out = tmp;
      return false;
   }

   /* var and parent, different values and intersect */
   return true;
}
279
/* tries to merge set_b into set_a of given temporary and
 * drops that temporary as it is being coalesced.
 * Returns false (leaving the context's sets untouched) on interference. */
bool
try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
{
   auto def_node_it = ctx.merge_node_table.find(dst.id());
   uint32_t index = def_node_it->second.index;
   merge_set& set_a = ctx.merge_sets[index];
   std::vector<Temp> dom; /* stack of the traversal: current chain of dominators */
   merge_set union_set;   /* the new merged merge-set */
   uint32_t i_a = 0;
   uint32_t i_b = 0;

   /* merge both (dominance-ordered) sets like a sorted-list merge */
   while (i_a < set_a.size() || i_b < set_b.size()) {
      Temp current;
      if (i_a == set_a.size())
         current = set_b[i_b++];
      else if (i_b == set_b.size())
         current = set_a[i_a++];
      /* else pick the one defined first */
      else if (defined_after(ctx, set_a[i_a], set_b[i_b]))
         current = set_b[i_b++];
      else
         current = set_a[i_a++];

      while (!dom.empty() && !dominates(ctx, dom.back(), current))
         dom.pop_back(); /* not the desired parent, remove */

      if (!dom.empty() && interference(ctx, current, dom.back()))
         return false; /* intersection detected */

      dom.emplace_back(current); /* otherwise, keep checking */
      if (current != dst)
         union_set.emplace_back(current); /* maintain the new merge-set sorted */
   }

   /* no interference: commit the merge and update the hashmap */
   for (Temp t : union_set) {
      merge_node& node = ctx.merge_node_table[t.id()];
      /* update the equal ancestors:
       * i.e. the 'closest' dominating def with the same value */
      Temp in = node.equal_anc_in;
      Temp out = node.equal_anc_out;
      if (in == Temp() || (out != Temp() && defined_after(ctx, out, in)))
         node.equal_anc_in = out;
      node.equal_anc_out = Temp();
      /* update merge-set index */
      node.index = index;
   }
   set_b = merge_set(); /* free the old set_b */
   ctx.merge_sets[index] = union_set;
   ctx.merge_node_table.erase(dst.id()); /* remove the temporary */

   return true;
}
335
/* returns true if the copy can safely be omitted (def coalesced with op) */
bool
try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
{
   /* we can only coalesce temporaries */
   if (!copy.op.isTemp())
      return false;

   /* try emplace a merge_node for the copy operand
    * (operator[] default-constructs the node on first access;
    * this happens before the regclass check below so the node is
    * initialized even when this particular copy cannot coalesce) */
   merge_node& op_node = ctx.merge_node_table[copy.op.tempId()];
   if (op_node.defined_at == -1u) {
      /* find defining block of operand by walking up the dominator
       * tree while the operand is still live-out of the idom */
      uint32_t pred = block_idx;
      do {
         block_idx = pred;
         pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom
                                                           : ctx.program->blocks[pred].linear_idom;
      } while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));
      op_node.defined_at = block_idx;
      op_node.value = copy.op;
   }

   /* we can only coalesce copies of the same register class */
   if (copy.op.regClass() != copy.def.regClass())
      return false;

   /* check if this operand has not yet been coalesced */
   if (op_node.index == -1u) {
      merge_set op_set = merge_set{copy.op.getTemp()};
      return try_merge_merge_set(ctx, copy.def.getTemp(), op_set);
   }

   /* check if this operand has been coalesced into the same set */
   assert(ctx.merge_node_table.count(copy.def.tempId()));
   if (op_node.index == ctx.merge_node_table[copy.def.tempId()].index)
      return true;

   /* otherwise, try to coalesce both merge sets */
   return try_merge_merge_set(ctx, copy.def.getTemp(), ctx.merge_sets[op_node.index]);
}
376
/* node in the location-transfer-graph: one pending copy,
 * keyed by its write merge-set index in the ltg map */
struct ltg_node {
   copy cp;
   uint32_t read_idx;     /* merge-set index of the copy's operand (-1u if none) */
   uint32_t num_uses = 0; /* how many other pending copies read this node's def */
};
383
/* emit the copies in an order that does not
 * create interferences within a merge-set:
 * repeatedly emit copies whose defs are no longer read by other
 * pending copies; remaining cycles become one p_parallelcopy */
void
emit_copies_block(Builder& bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
{
   auto&& it = ltg.begin();
   while (it != ltg.end()) {
      const copy& cp = it->second.cp;
      /* wrong regclass or still needed as operand */
      if (cp.def.regClass().type() != type || it->second.num_uses > 0) {
         ++it;
         continue;
      }

      /* emit the copy */
      bld.copy(cp.def, it->second.cp.op);

      /* update the location transfer graph */
      if (it->second.read_idx != -1u) {
         auto&& other = ltg.find(it->second.read_idx);
         if (other != ltg.end())
            other->second.num_uses--;
      }
      ltg.erase(it);
      /* restart: the decremented node may have become emittable */
      it = ltg.begin();
   }

   /* count the number of remaining circular dependencies */
   unsigned num = std::count_if(ltg.begin(), ltg.end(),
                                [&](auto& n) { return n.second.cp.def.regClass().type() == type; });

   /* if there are circular dependencies, we just emit them as single parallelcopy */
   if (num) {
      // TODO: this should be restricted to a feasible number of registers
      // and otherwise use a temporary to avoid having to reload more (spilled)
      // variables than we have registers.
      aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
         aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
      it = ltg.begin();
      for (unsigned i = 0; i < num; i++) {
         /* skip nodes of the other register type */
         while (it->second.cp.def.regClass().type() != type)
            ++it;

         copy->definitions[i] = it->second.cp.def;
         copy->operands[i] = it->second.cp.op;
         it = ltg.erase(it);
      }
      bld.insert(std::move(copy));
   }
}
434
/* either emits or coalesces all parallelcopies and
 * renames the phi-operands accordingly. */
void
emit_parallelcopies(cssa_ctx& ctx)
{
   /* coalesced copy-def tempid -> operand to use in the phi instead */
   std::unordered_map<uint32_t, Operand> renames;

   /* we iterate backwards to prioritize coalescing in else-blocks */
   for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
      if (ctx.parallelcopies[i].empty())
         continue;

      std::map<uint32_t, ltg_node> ltg;
      bool has_vgpr_copy = false;
      bool has_sgpr_copy = false;

      /* first, try to coalesce all parallelcopies */
      for (const copy& cp : ctx.parallelcopies[i]) {
         if (try_coalesce_copy(ctx, cp, i)) {
            renames.emplace(cp.def.tempId(), cp.op);
            /* update liveness info */
            ctx.live_out[i].erase(cp.def.tempId());
            ctx.live_out[i].insert(cp.op.tempId());
         } else {
            /* not coalesced: queue the copy in the location-transfer-graph */
            uint32_t read_idx = -1u;
            if (cp.op.isTemp())
               read_idx = ctx.merge_node_table[cp.op.tempId()].index;
            uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index;
            assert(write_idx != -1u);
            ltg[write_idx] = {cp, read_idx};

            bool is_vgpr = cp.def.regClass().type() == RegType::vgpr;
            has_vgpr_copy |= is_vgpr;
            has_sgpr_copy |= !is_vgpr;
         }
      }

      /* build location-transfer-graph: count uses of each pending def */
      for (auto& pair : ltg) {
         if (pair.second.read_idx == -1u)
            continue;
         auto&& it = ltg.find(pair.second.read_idx);
         if (it != ltg.end())
            it->second.num_uses++;
      }

      /* emit parallelcopies ordered */
      Builder bld(ctx.program);
      Block& block = ctx.program->blocks[i];

      if (has_vgpr_copy) {
         /* emit VGPR copies before the p_logical_end of the block */
         auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
         { return inst->opcode == aco_opcode::p_logical_end; };
         auto it =
            std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);
         bld.reset(&block.instructions, std::prev(it.base()));
         emit_copies_block(bld, ltg, RegType::vgpr);
      }

      if (has_sgpr_copy) {
         /* emit SGPR copies right before the branch at the end of the block */
         aco_ptr<Instruction> branch = std::move(block.instructions.back());
         block.instructions.pop_back();
         bld.reset(&block.instructions);
         emit_copies_block(bld, ltg, RegType::sgpr);
         bld.insert(std::move(branch));
      }
   }

   /* finally, rename coalesced phi operands */
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
            break;

         for (Operand& op : phi->operands) {
            if (!op.isTemp())
               continue;
            auto&& it = renames.find(op.tempId());
            if (it != renames.end()) {
               op = it->second;
               renames.erase(it);
            }
         }
      }
   }
   /* every coalesced copy must have been applied to exactly one phi operand */
   assert(renames.empty());
}
524
525 } /* end namespace */
526
527 void
lower_to_cssa(Program * program,live & live_vars)528 lower_to_cssa(Program* program, live& live_vars)
529 {
530 reindex_ssa(program, live_vars.live_out);
531 cssa_ctx ctx = {program, live_vars.live_out};
532 collect_parallelcopies(ctx);
533 emit_parallelcopies(ctx);
534
535 /* update live variable information */
536 live_vars = live_var_analysis(program);
537 }
538 } // namespace aco
539