1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "brw_eu.h"
29 #include "brw_fs.h"
30 #include "brw_cfg.h"
31 #include "util/set.h"
32 #include "util/register_allocate.h"
33
34 using namespace brw;
35
36 static void
37 assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
38 {
39 if (reg->file == VGRF) {
40 reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
41 reg->offset %= REG_SIZE;
42 }
43 }
44
45 void
46 fs_visitor::assign_regs_trivial()
47 {
48 unsigned hw_reg_mapping[this->alloc.count + 1];
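/* hw_reg_mapping[i] holds the first hardware GRF assigned to VGRF i; the
 * extra trailing entry ends up equal to the total number of GRFs used.
 */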
49 unsigned i;
50 int reg_width = dispatch_width / 8;
51
52 /* Note that compressed instructions require alignment to 2 registers. */
53 hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
54 for (i = 1; i <= this->alloc.count; i++) {
55 hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
56 this->alloc.sizes[i - 1]);
57 }
58 this->grf_used = hw_reg_mapping[this->alloc.count];
59
60 foreach_block_and_inst(block, fs_inst, inst, cfg) {
61 assign_reg(hw_reg_mapping, &inst->dst);
62 for (i = 0; i < inst->sources; i++) {
63 assign_reg(hw_reg_mapping, &inst->src[i]);
64 }
65 }
66
67 if (this->grf_used >= max_grf) {
68 fail("Ran out of regs on trivial allocator (%d/%d)\n",
69 this->grf_used, max_grf);
70 } else {
71 this->alloc.count = this->grf_used;
72 }
73
74 }
75
76 /**
77 * Size of a register from the aligned_bary_class register class.
78 */
79 static unsigned
80 aligned_bary_size(unsigned dispatch_width)
81 {
82 return (dispatch_width == 8 ? 2 : 4);
83 }
84
85 static void
86 brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
87 {
88 const struct gen_device_info *devinfo = compiler->devinfo;
89 int base_reg_count = BRW_MAX_GRF;
90 const int index = util_logbase2(dispatch_width / 8);
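/* index is 0, 1 or 2 for SIMD8, SIMD16 or SIMD32 respectively, selecting
 * which compiler->fs_reg_sets[] entry this call fills in.
 */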
91
92 if (dispatch_width > 8 && devinfo->gen >= 7) {
93 /* For IVB+, we don't need the PLN hacks or the even-reg alignment in
94 * SIMD16. Therefore, we can use the exact same register sets for
95 * SIMD16 as we do for SIMD8 and we don't need to recalculate them.
96 */
97 compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0];
98 return;
99 }
100
101 /* The registers used to make up almost all values handled in the compiler
102 * are scalar values occupying a single register (or 2 registers in the
103 * case of SIMD16, which is handled by dividing base_reg_count by 2 and
104 * multiplying allocated register numbers by 2). Things that were
105 * aggregates of scalar values at the GLSL level were split to scalar
106 * values by split_virtual_grfs().
107 *
108 * However, texture SEND messages return a series of contiguous registers
109 * to write into. We currently always ask for 4 registers, but we may
110 * convert that to use fewer some day.
111 *
112 * Additionally, on gen5 we need aligned pairs of registers for the PLN
113 * instruction, and on gen4 we need 8 contiguous regs for workaround simd16
114 * texturing.
115 */
116 const int class_count = MAX_VGRF_SIZE;
117 int class_sizes[MAX_VGRF_SIZE];
118 for (unsigned i = 0; i < MAX_VGRF_SIZE; i++)
119 class_sizes[i] = i + 1;
120
121 memset(compiler->fs_reg_sets[index].class_to_ra_reg_range, 0,
122 sizeof(compiler->fs_reg_sets[index].class_to_ra_reg_range));
123 int *class_to_ra_reg_range = compiler->fs_reg_sets[index].class_to_ra_reg_range;
124
125 /* Compute the total number of registers across all classes. */
126 int ra_reg_count = 0;
127 for (int i = 0; i < class_count; i++) {
128 if (devinfo->gen <= 5 && dispatch_width >= 16) {
129 /* From the G45 PRM:
130 *
131 * In order to reduce the hardware complexity, the following
132 * rules and restrictions apply to the compressed instruction:
133 * ...
134 * * Operand Alignment Rule: With the exceptions listed below, a
135 * source/destination operand in general should be aligned to
136 * even 256-bit physical register with a region size equal to
137 * two 256-bit physical register
138 */
139 ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
140 } else {
141 ra_reg_count += base_reg_count - (class_sizes[i] - 1);
142 }
143 /* Mark the last register. We'll fill in the beginnings later. */
144 class_to_ra_reg_range[class_sizes[i]] = ra_reg_count;
145 }
146
147 /* Fill out the rest of the range markers */
148 for (int i = 1; i < 17; ++i) {
149 if (class_to_ra_reg_range[i] == 0)
150 class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1];
151 }
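/* At this point class_to_ra_reg_range[i] is one past the last RA register
 * usable by a class of size i, and ra_reg_count is the total number of RA
 * registers across all classes.
 */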
152
153 uint8_t *ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
154 struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
155 if (devinfo->gen >= 6)
156 ra_set_allocate_round_robin(regs);
157 int *classes = ralloc_array(compiler, int, class_count);
158 int aligned_bary_class = -1;
159
160 /* Allocate space for q values. We allocate class_count + 1 because we
161 * want to leave room for the aligned barycentric class if we have it.
162 */
163 unsigned int **q_values = ralloc_array(compiler, unsigned int *,
164 class_count + 1);
165 for (int i = 0; i < class_count + 1; ++i)
166 q_values[i] = ralloc_array(q_values, unsigned int, class_count + 1);
167
168 /* Now, add the registers to their classes, and add the conflicts
169 * between them and the base GRF registers (and also each other).
170 */
171 int reg = 0;
172 int aligned_bary_base_reg = 0;
173 int aligned_bary_reg_count = 0;
174 for (int i = 0; i < class_count; i++) {
175 int class_reg_count;
176 if (devinfo->gen <= 5 && dispatch_width >= 16) {
177 class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
178
179 /* See comment below. The only difference here is that we are
180 * dealing with pairs of registers instead of single registers.
181 * Registers of odd sizes simply get rounded up. */
182 for (int j = 0; j < class_count; j++)
183 q_values[i][j] = (class_sizes[i] + 1) / 2 +
184 (class_sizes[j] + 1) / 2 - 1;
185 } else {
186 class_reg_count = base_reg_count - (class_sizes[i] - 1);
187
188 /* From register_allocate.c:
189 *
190 * q(B,C) (indexed by C, B is this register class) in
191 * Runeson/Nyström paper. This is "how many registers of B could
192 * the worst choice register from C conflict with".
193 *
194 * If we just let the register allocation algorithm compute these
195 * values, it is extremely expensive. However, since all of our
196 * registers are laid out, we can very easily compute them
197 * ourselves. View the register from C as fixed starting at GRF n
198 * somewhere in the middle, and the register from B as sliding back
199 * and forth. Then the first register to conflict from B is the
200 * one starting at n - class_size[B] + 1 and the last register to
201 * conflict will start at n + class_size[B] - 1. Therefore, the
202 * number of conflicts from B is class_size[B] + class_size[C] - 1.
203 *
204 * +-+-+-+-+-+-+ +-+-+-+-+-+-+
205 * B | | | | | |n| --> | | | | | | |
206 * +-+-+-+-+-+-+ +-+-+-+-+-+-+
207 * +-+-+-+-+-+
208 * C |n| | | | |
209 * +-+-+-+-+-+
210 */
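/* For example, with class_sizes[i] == 2 and class_sizes[j] == 4 we get
 * q_values[i][j] == 2 + 4 - 1 == 5: five distinct 2-GRF registers can
 * overlap any given 4-GRF register.
 */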
211 for (int j = 0; j < class_count; j++)
212 q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
213 }
214 classes[i] = ra_alloc_reg_class(regs);
215
216 /* Save this off for the aligned barycentric class at the end. */
217 if (class_sizes[i] == int(aligned_bary_size(dispatch_width))) {
218 aligned_bary_base_reg = reg;
219 aligned_bary_reg_count = class_reg_count;
220 }
221
222 if (devinfo->gen <= 5 && dispatch_width >= 16) {
223 for (int j = 0; j < class_reg_count; j++) {
224 ra_class_add_reg(regs, classes[i], reg);
225
226 ra_reg_to_grf[reg] = j * 2;
227
228 for (int base_reg = j;
229 base_reg < j + (class_sizes[i] + 1) / 2;
230 base_reg++) {
231 ra_add_reg_conflict(regs, base_reg, reg);
232 }
233
234 reg++;
235 }
236 } else {
237 for (int j = 0; j < class_reg_count; j++) {
238 ra_class_add_reg(regs, classes[i], reg);
239
240 ra_reg_to_grf[reg] = j;
241
242 for (int base_reg = j;
243 base_reg < j + class_sizes[i];
244 base_reg++) {
245 ra_add_reg_conflict(regs, base_reg, reg);
246 }
247
248 reg++;
249 }
250 }
251 }
252 assert(reg == ra_reg_count);
253
254 /* Applying transitivity to all of the base registers gives us the
255 * appropriate register conflict relationships everywhere.
256 */
257 for (int reg = 0; reg < base_reg_count; reg++)
258 ra_make_reg_conflicts_transitive(regs, reg);
259
260 /* Add a special class for aligned barycentrics, which we'll put the
261 * first source of LINTERP on so that we can do PLN on Gen <= 6.
262 */
263 if (devinfo->has_pln && (devinfo->gen == 6 ||
264 (dispatch_width == 8 && devinfo->gen <= 5))) {
265 aligned_bary_class = ra_alloc_reg_class(regs);
266
267 for (int i = 0; i < aligned_bary_reg_count; i++) {
268 if ((ra_reg_to_grf[aligned_bary_base_reg + i] & 1) == 0) {
269 ra_class_add_reg(regs, aligned_bary_class,
270 aligned_bary_base_reg + i);
271 }
272 }
273
274 for (int i = 0; i < class_count; i++) {
275 /* These are a little counter-intuitive because the barycentric
276 * registers are required to be aligned while the registers they are
277 * potentially interfering with are not. In the case where the size
278 * is even, the worst-case is that the register is odd-aligned. In
279 * the odd-size case, it doesn't matter.
280 */
281 q_values[class_count][i] = class_sizes[i] / 2 +
282 aligned_bary_size(dispatch_width) / 2;
283 q_values[i][class_count] = class_sizes[i] +
284 aligned_bary_size(dispatch_width) - 1;
285 }
286 q_values[class_count][class_count] = aligned_bary_size(dispatch_width) - 1;
287 }
288
289 ra_set_finalize(regs, q_values);
290
291 ralloc_free(q_values);
292
293 compiler->fs_reg_sets[index].regs = regs;
294 for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++)
295 compiler->fs_reg_sets[index].classes[i] = -1;
296 for (int i = 0; i < class_count; i++)
297 compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
298 compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
299 compiler->fs_reg_sets[index].aligned_bary_class = aligned_bary_class;
300 }
301
302 void
303 brw_fs_alloc_reg_sets(struct brw_compiler *compiler)
304 {
305 brw_alloc_reg_set(compiler, 8);
306 brw_alloc_reg_set(compiler, 16);
307 brw_alloc_reg_set(compiler, 32);
308 }
309
310 static int
311 count_to_loop_end(const bblock_t *block)
312 {
313 if (block->end()->opcode == BRW_OPCODE_WHILE)
314 return block->end_ip;
315
316 int depth = 1;
317 /* Skip the first block, since we don't want to count the DO that the
318 * calling function found.
319 */
320 for (block = block->next();
321 depth > 0;
322 block = block->next()) {
323 if (block->start()->opcode == BRW_OPCODE_DO)
324 depth++;
325 if (block->end()->opcode == BRW_OPCODE_WHILE) {
326 depth--;
327 if (depth == 0)
328 return block->end_ip;
329 }
330 }
331 unreachable("not reached");
332 }
333
334 void fs_visitor::calculate_payload_ranges(int payload_node_count,
335 int *payload_last_use_ip) const
336 {
337 int loop_depth = 0;
338 int loop_end_ip = 0;
339
340 for (int i = 0; i < payload_node_count; i++)
341 payload_last_use_ip[i] = -1;
342
343 int ip = 0;
344 foreach_block_and_inst(block, fs_inst, inst, cfg) {
345 switch (inst->opcode) {
346 case BRW_OPCODE_DO:
347 loop_depth++;
348
349 /* Since payload regs are defined only at the start of the shader
350 * execution, any uses of the payload within a loop mean the live
351 * interval extends to the end of the outermost loop. Find the ip of
352 * the end now.
353 */
354 if (loop_depth == 1)
355 loop_end_ip = count_to_loop_end(block);
356 break;
357 case BRW_OPCODE_WHILE:
358 loop_depth--;
359 break;
360 default:
361 break;
362 }
363
364 int use_ip;
365 if (loop_depth > 0)
366 use_ip = loop_end_ip;
367 else
368 use_ip = ip;
369
370 /* Note that UNIFORM args have been turned into FIXED_GRF by
371 * assign_curbe_setup(), and interpolation uses fixed hardware regs from
372 * the start (see interp_reg()).
373 */
374 for (int i = 0; i < inst->sources; i++) {
375 if (inst->src[i].file == FIXED_GRF) {
376 int node_nr = inst->src[i].nr;
377 if (node_nr >= payload_node_count)
378 continue;
379
380 for (unsigned j = 0; j < regs_read(inst, i); j++) {
381 payload_last_use_ip[node_nr + j] = use_ip;
382 assert(node_nr + j < unsigned(payload_node_count));
383 }
384 }
385 }
386
387 /* Special case instructions which have extra implied registers used. */
388 switch (inst->opcode) {
389 case CS_OPCODE_CS_TERMINATE:
390 payload_last_use_ip[0] = use_ip;
391 break;
392
393 default:
394 if (inst->eot) {
395 /* We could omit this for the !inst->header_present case, except
396 * that the simulator apparently incorrectly reads from g0/g1
397 * instead of sideband. It also really freaks out driver
398 * developers to see g0 used in unusual places, so just always
399 * reserve it.
400 */
401 payload_last_use_ip[0] = use_ip;
402 payload_last_use_ip[1] = use_ip;
403 }
404 break;
405 }
406
407 ip++;
408 }
409 }
410
411 class fs_reg_alloc {
412 public:
413 fs_reg_alloc(fs_visitor *fs):
414 fs(fs), devinfo(fs->devinfo), compiler(fs->compiler),
415 live(fs->live_analysis.require()), g(NULL),
416 have_spill_costs(false)
417 {
418 mem_ctx = ralloc_context(NULL);
419
420 /* Stash the number of instructions so we can sanity check that our
421 * counts still match liveness.
422 */
423 live_instr_count = fs->cfg->last_block()->end_ip + 1;
424
425 spill_insts = _mesa_pointer_set_create(mem_ctx);
426
427 /* Most of this allocation was written for a reg_width of 1
428 * (dispatch_width == 8). In extending to SIMD16, the code was
429 * left in place and it was converted to have the hardware
430 * registers it's allocating be contiguous physical pairs of regs
431 * for reg_width == 2.
432 */
433 int reg_width = fs->dispatch_width / 8;
434 rsi = util_logbase2(reg_width);
435 payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width);
436
437 /* Get payload IP information */
438 payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count);
439
440 node_count = 0;
441 first_payload_node = 0;
442 first_mrf_hack_node = 0;
443 scratch_header_node = 0;
444 grf127_send_hack_node = 0;
445 first_vgrf_node = 0;
446 last_vgrf_node = 0;
447 first_spill_node = 0;
448
449 spill_vgrf_ip = NULL;
450 spill_vgrf_ip_alloc = 0;
451 spill_node_count = 0;
452 }
453
454 ~fs_reg_alloc()
455 {
456 ralloc_free(mem_ctx);
457 }
458
459 bool assign_regs(bool allow_spilling, bool spill_all);
460
461 private:
462 void setup_live_interference(unsigned node,
463 int node_start_ip, int node_end_ip);
464 void setup_inst_interference(const fs_inst *inst);
465
466 void build_interference_graph(bool allow_spilling);
467 void discard_interference_graph();
468
469 void emit_unspill(const fs_builder &bld, fs_reg dst,
470 uint32_t spill_offset, unsigned count);
471 void emit_spill(const fs_builder &bld, fs_reg src,
472 uint32_t spill_offset, unsigned count);
473
474 void set_spill_costs();
475 int choose_spill_reg();
476 fs_reg alloc_scratch_header();
477 fs_reg alloc_spill_reg(unsigned size, int ip);
478 void spill_reg(unsigned spill_reg);
479
480 void *mem_ctx;
481 fs_visitor *fs;
482 const gen_device_info *devinfo;
483 const brw_compiler *compiler;
484 const fs_live_variables &live;
485 int live_instr_count;
486
487 set *spill_insts;
488
489 /* Which compiler->fs_reg_sets[] to use */
490 int rsi;
491
492 ra_graph *g;
493 bool have_spill_costs;
494
495 int payload_node_count;
496 int *payload_last_use_ip;
497
498 int node_count;
499 int first_payload_node;
500 int first_mrf_hack_node;
501 int scratch_header_node;
502 int grf127_send_hack_node;
503 int first_vgrf_node;
504 int last_vgrf_node;
505 int first_spill_node;
506
507 int *spill_vgrf_ip;
508 int spill_vgrf_ip_alloc;
509 int spill_node_count;
510
511 fs_reg scratch_header;
512 };
513
514 /**
515 * Sets the mrf_used array to indicate which MRFs are used by the shader IR
516 *
517 * This is used in assign_regs() to decide which of the GRFs that we use as
518 * MRFs on gen7 get normally register allocated, and in register spilling to
519 * see if we can actually use MRFs to do spills without overwriting normal MRF
520 * contents.
521 */
522 static void
523 get_used_mrfs(const fs_visitor *v, bool *mrf_used)
524 {
525 int reg_width = v->dispatch_width / 8;
526
527 memset(mrf_used, 0, BRW_MAX_MRF(v->devinfo->gen) * sizeof(bool));
528
529 foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
530 if (inst->dst.file == MRF) {
531 int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
532 mrf_used[reg] = true;
533 if (reg_width == 2) {
534 if (inst->dst.nr & BRW_MRF_COMPR4) {
535 mrf_used[reg + 4] = true;
536 } else {
537 mrf_used[reg + 1] = true;
538 }
539 }
540 }
541
542 if (inst->mlen > 0) {
543 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
544 mrf_used[inst->base_mrf + i] = true;
545 }
546 }
547 }
548 }
549
550 namespace {
551 /**
552 * Maximum spill block size we expect to encounter in 32B units.
553 *
554 * This is somewhat arbitrary and doesn't necessarily limit the maximum
555 * variable size that can be spilled -- A higher value will allow a
556 * variable of a given size to be spilled more efficiently with a smaller
557 * number of scratch messages, but will increase the likelihood of a
558 * collision between the MRFs reserved for spilling and other MRFs used by
559 * the program (and possibly increase GRF register pressure on platforms
560 * without hardware MRFs), which could cause register allocation to fail.
561 *
562 * For the moment reserve just enough space so a register of 32 bit
563 * component type and natural region width can be spilled without splitting
564 * into multiple (force_writemask_all) scratch messages.
565 */
566 unsigned
567 spill_max_size(const backend_shader *s)
568 {
569 /* FINISHME - On Gen7+ it should be possible to avoid this limit
570 * altogether by spilling directly from the temporary GRF
571 * allocated to hold the result of the instruction (and the
572 * scratch write header).
573 */
574 /* FINISHME - The shader's dispatch width probably belongs in
575 * backend_shader (or some nonexistent fs_shader class?)
576 * rather than in the visitor class.
577 */
578 return static_cast<const fs_visitor *>(s)->dispatch_width / 8;
579 }
580
581 /**
582 * First MRF register available for spilling.
583 */
584 unsigned
585 spill_base_mrf(const backend_shader *s)
586 {
587 /* We don't use the MRF hack on Gen9+ */
588 assert(s->devinfo->gen < 9);
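/* For example, with BRW_MAX_MRF == 16 (Gen7/8) and SIMD16 dispatch
 * (spill_max_size == 2) this returns 13, so spill messages occupy m13
 * (header) plus m14-m15 (data), matching the "MRFs 13-15" note in
 * spill_reg().
 */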
589 return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1;
590 }
591 }
592
593 void
594 fs_reg_alloc::setup_live_interference(unsigned node,
595 int node_start_ip, int node_end_ip)
596 {
597 /* Mark any virtual grf that is live between the start of the program and
598 * the last use of a payload node interfering with that payload node.
599 */
600 for (int i = 0; i < payload_node_count; i++) {
601 if (payload_last_use_ip[i] == -1)
602 continue;
603
604 /* Note that we use a <= comparison, unlike vgrfs_interfere(),
605 * in order to not have to worry about the uniform issue described in
606 * calculate_live_intervals().
607 */
608 if (node_start_ip <= payload_last_use_ip[i])
609 ra_add_node_interference(g, node, first_payload_node + i);
610 }
611
612 /* If we have the MRF hack enabled, mark this node as interfering with all
613 * MRF registers.
614 */
615 if (first_mrf_hack_node >= 0) {
616 for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++)
617 ra_add_node_interference(g, node, first_mrf_hack_node + i);
618 }
619
620 /* Everything interferes with the scratch header */
621 if (scratch_header_node >= 0)
622 ra_add_node_interference(g, node, scratch_header_node);
623
624 /* Add interference with every vgrf whose live range intersects this
625 * node's. We only need to look at nodes below this one as the reflexivity
626 * of interference will take care of the rest.
627 */
628 for (unsigned n2 = first_vgrf_node;
629 n2 <= (unsigned)last_vgrf_node && n2 < node; n2++) {
630 unsigned vgrf = n2 - first_vgrf_node;
631 if (!(node_end_ip <= live.vgrf_start[vgrf] ||
632 live.vgrf_end[vgrf] <= node_start_ip))
633 ra_add_node_interference(g, node, n2);
634 }
635 }
636
637 void
638 fs_reg_alloc::setup_inst_interference(const fs_inst *inst)
639 {
640 /* Certain instructions can't safely use the same register for their
641 * sources and destination. Add interference.
642 */
643 if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
644 for (unsigned i = 0; i < inst->sources; i++) {
645 if (inst->src[i].file == VGRF) {
646 ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
647 first_vgrf_node + inst->src[i].nr);
648 }
649 }
650 }
651
652 /* In 16-wide instructions we have an issue where a compressed
653 * instruction is actually two instructions executed simultaneously.
654 * It's actually ok to have the source and destination registers be
655 * the same. In this case, each instruction over-writes its own
656 * source and there's no problem. The real problem here is if the
657 * source and destination registers are off by one. Then you can end
658 * up in a scenario where the first instruction over-writes the
659 * source of the second instruction. Since the compiler doesn't know
660 * about this level of granularity, we simply make the source and
661 * destination interfere.
662 */
663 if (inst->exec_size >= 16 && inst->dst.file == VGRF) {
664 for (int i = 0; i < inst->sources; ++i) {
665 if (inst->src[i].file == VGRF) {
666 ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
667 first_vgrf_node + inst->src[i].nr);
668 }
669 }
670 }
671
672 if (grf127_send_hack_node >= 0) {
673 /* From the Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
674 * subsection "EUISA Instructions", Send Message (page 990):
675 *
676 * "r127 must not be used for return address when there is a src and
677 * dest overlap in send instruction."
678 *
679 * We avoid using grf127 as part of the destination of send messages by
680 * adding a node interference to the grf127_send_hack_node. This node
681 * has a fixed assignment to grf127.
682 *
683 * We don't apply it to SIMD16 instructions because previous code avoids
684 * any register overlap between sources and destination.
685 */
686 if (inst->exec_size < 16 && inst->is_send_from_grf() &&
687 inst->dst.file == VGRF)
688 ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
689 grf127_send_hack_node);
690
691 /* Spill instructions are generated as SEND messages from MRF, but since
692 * Gen7+ supports sending from GRF, the driver will assign these MRF
693 * registers to GRFs. The implementation reuses the dest of the send
694 * message as a source, so an overlap is guaranteed and we create an
695 * interference between the destination and grf127.
696 */
697 if ((inst->opcode == SHADER_OPCODE_GEN7_SCRATCH_READ ||
698 inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ) &&
699 inst->dst.file == VGRF)
700 ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
701 grf127_send_hack_node);
702 }
703
704 /* From the Skylake PRM Vol. 2a docs for sends:
705 *
706 * "It is required that the second block of GRFs does not overlap with
707 * the first block."
708 *
709 * Normally, this is taken care of by fixup_sends_duplicate_payload() but
710 * in the case where one of the registers is an undefined value, the
711 * register allocator may decide that they don't interfere even though
712 * they're used as sources in the same instruction. We also need to add
713 * interference here.
714 */
715 if (devinfo->gen >= 9) {
716 if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
717 inst->src[2].file == VGRF && inst->src[3].file == VGRF &&
718 inst->src[2].nr != inst->src[3].nr)
719 ra_add_node_interference(g, first_vgrf_node + inst->src[2].nr,
720 first_vgrf_node + inst->src[3].nr);
721 }
722
723 /* When we do send-from-GRF for FB writes, we need to ensure that the last
724 * write instruction sends from a high register. This is because the
725 * vertex fetcher wants to start filling the low payload registers while
726 * the pixel data port is still working on writing out the memory. If we
727 * don't do this, we get rendering artifacts.
728 *
729 * We could just do "something high". Instead, we just pick the highest
730 * register that works.
731 */
732 if (inst->eot) {
733 const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
734 inst->src[2].nr : inst->src[0].nr;
735 int size = fs->alloc.sizes[vgrf];
736 int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
737
738 if (first_mrf_hack_node >= 0) {
739 /* If something happened to spill, we want to push the EOT send
740 * register early enough in the register file that we don't
741 * conflict with any used MRF hack registers.
742 */
743 reg -= BRW_MAX_MRF(devinfo->gen) - spill_base_mrf(fs);
744 } else if (grf127_send_hack_node >= 0) {
745 /* Avoid r127 which might be unusable if the node was previously
746 * written by a SIMD8 SEND message with source/destination overlap.
747 */
748 reg--;
749 }
750
751 ra_set_node_reg(g, first_vgrf_node + vgrf, reg);
752 }
753 }
754
755 void
756 fs_reg_alloc::build_interference_graph(bool allow_spilling)
757 {
758 /* Compute the RA node layout */
759 node_count = 0;
760 first_payload_node = node_count;
761 node_count += payload_node_count;
762 if (devinfo->gen >= 7 && devinfo->gen < 9 && allow_spilling) {
763 first_mrf_hack_node = node_count;
764 node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
765 } else {
766 first_mrf_hack_node = -1;
767 }
768 if (devinfo->gen >= 8) {
769 grf127_send_hack_node = node_count;
770 node_count ++;
771 } else {
772 grf127_send_hack_node = -1;
773 }
774 first_vgrf_node = node_count;
775 node_count += fs->alloc.count;
776 last_vgrf_node = node_count - 1;
777 if (devinfo->gen >= 9 && allow_spilling) {
778 scratch_header_node = node_count++;
779 } else {
780 scratch_header_node = -1;
781 }
782 first_spill_node = node_count;
783
784 fs->calculate_payload_ranges(payload_node_count,
785 payload_last_use_ip);
786
787 assert(g == NULL);
788 g = ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
789 ralloc_steal(mem_ctx, g);
790
791 /* Set up the payload nodes */
792 for (int i = 0; i < payload_node_count; i++) {
793 /* Mark each payload node as being allocated to its physical register.
794 *
795 * The alternative would be to have per-physical-register classes, which
796 * would just be silly.
797 */
798 if (devinfo->gen <= 5 && fs->dispatch_width >= 16) {
799 /* We have to divide by 2 here because we only have even numbered
800 * registers. Some of the payload registers will be odd, but
801 * that's ok because their physical register numbers have already
802 * been assigned. The only thing this is used for is interference.
803 */
804 ra_set_node_reg(g, first_payload_node + i, i / 2);
805 } else {
806 ra_set_node_reg(g, first_payload_node + i, i);
807 }
808 }
809
810 if (first_mrf_hack_node >= 0) {
811 /* Mark each MRF reg node as being allocated to its physical
812 * register.
813 *
814 * The alternative would be to have per-physical-register classes,
815 * which would just be silly.
816 */
817 for (int i = 0; i < BRW_MAX_MRF(devinfo->gen); i++) {
818 ra_set_node_reg(g, first_mrf_hack_node + i,
819 GEN7_MRF_HACK_START + i);
820 }
821 }
822
823 if (grf127_send_hack_node >= 0)
824 ra_set_node_reg(g, grf127_send_hack_node, 127);
825
826 /* Specify the classes of each virtual register. */
827 for (unsigned i = 0; i < fs->alloc.count; i++) {
828 unsigned size = fs->alloc.sizes[i];
829
830 assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) &&
831 "Register allocation relies on split_virtual_grfs()");
832
833 ra_set_node_class(g, first_vgrf_node + i,
834 compiler->fs_reg_sets[rsi].classes[size - 1]);
835 }
836
837 /* Special case: on pre-Gen7 hardware that supports PLN, the second operand
838 * of a PLN instruction needs to be an even-numbered register, so we have a
839 * special register class aligned_bary_class to handle this case.
840 */
841 if (compiler->fs_reg_sets[rsi].aligned_bary_class >= 0) {
842 foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
843 if (inst->opcode == FS_OPCODE_LINTERP && inst->src[0].file == VGRF &&
844 fs->alloc.sizes[inst->src[0].nr] ==
845 aligned_bary_size(fs->dispatch_width)) {
846 ra_set_node_class(g, first_vgrf_node + inst->src[0].nr,
847 compiler->fs_reg_sets[rsi].aligned_bary_class);
848 }
849 }
850 }
851
852 /* Add interference based on the live range of the register */
853 for (unsigned i = 0; i < fs->alloc.count; i++) {
854 setup_live_interference(first_vgrf_node + i,
855 live.vgrf_start[i],
856 live.vgrf_end[i]);
857 }
858
859 /* Add interference based on the instructions in which a register is used.
860 */
861 foreach_block_and_inst(block, fs_inst, inst, fs->cfg)
862 setup_inst_interference(inst);
863 }
864
865 void
866 fs_reg_alloc::discard_interference_graph()
867 {
868 ralloc_free(g);
869 g = NULL;
870 have_spill_costs = false;
871 }
872
873 void
874 fs_reg_alloc::emit_unspill(const fs_builder &bld, fs_reg dst,
875 uint32_t spill_offset, unsigned count)
876 {
877 const gen_device_info *devinfo = bld.shader->devinfo;
878 const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
879 REG_SIZE;
880 assert(count % reg_size == 0);
881
882 for (unsigned i = 0; i < count / reg_size; i++) {
883 fs_inst *unspill_inst;
884 if (devinfo->gen >= 9) {
885 fs_reg header = this->scratch_header;
886 fs_builder ubld = bld.exec_all().group(1, 0);
887 assert(spill_offset % 16 == 0);
888 unspill_inst = ubld.MOV(component(header, 2),
889 brw_imm_ud(spill_offset / 16));
890 _mesa_set_add(spill_insts, unspill_inst);
891
892 fs_reg srcs[] = { brw_imm_ud(0), brw_imm_ud(0), header };
893 unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
894 srcs, ARRAY_SIZE(srcs));
895 unspill_inst->mlen = 1;
896 unspill_inst->header_size = 1;
897 unspill_inst->size_written = reg_size * REG_SIZE;
898 unspill_inst->send_has_side_effects = false;
899 unspill_inst->send_is_volatile = true;
900 unspill_inst->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
901 unspill_inst->desc =
902 brw_dp_read_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
903 BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8),
904 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
905 BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
906 } else if (devinfo->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
907 /* The Gen7 descriptor-based offset is 12 bits of HWORD units.
908 * Because the Gen7-style scratch block read is hardwired to BTI 255,
909 * on Gen9+ it would cause the DC to do an IA-coherent read, which
910 * largely outweighs the slight advantage from not having to provide
911 * the address as part of the message header, so we're better off
912 * using plain old oword block reads.
913 */
914 unspill_inst = bld.emit(SHADER_OPCODE_GEN7_SCRATCH_READ, dst);
915 unspill_inst->offset = spill_offset;
916 } else {
917 unspill_inst = bld.emit(SHADER_OPCODE_GEN4_SCRATCH_READ, dst);
918 unspill_inst->offset = spill_offset;
919 unspill_inst->base_mrf = spill_base_mrf(bld.shader);
920 unspill_inst->mlen = 1; /* header contains offset */
921 }
922 _mesa_set_add(spill_insts, unspill_inst);
923
924 dst.offset += reg_size * REG_SIZE;
925 spill_offset += reg_size * REG_SIZE;
926 }
927 }
928
929 void
930 fs_reg_alloc::emit_spill(const fs_builder &bld, fs_reg src,
931 uint32_t spill_offset, unsigned count)
932 {
933 const gen_device_info *devinfo = bld.shader->devinfo;
934 const unsigned reg_size = src.component_size(bld.dispatch_width()) /
935 REG_SIZE;
936 assert(count % reg_size == 0);
937
938 for (unsigned i = 0; i < count / reg_size; i++) {
939 fs_inst *spill_inst;
940 if (devinfo->gen >= 9) {
941 fs_reg header = this->scratch_header;
942 fs_builder ubld = bld.exec_all().group(1, 0);
943 assert(spill_offset % 16 == 0);
944 spill_inst = ubld.MOV(component(header, 2),
945 brw_imm_ud(spill_offset / 16));
946 _mesa_set_add(spill_insts, spill_inst);
947
948 fs_reg srcs[] = { brw_imm_ud(0), brw_imm_ud(0), header, src };
949 spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
950 srcs, ARRAY_SIZE(srcs));
951 spill_inst->mlen = 1;
952 spill_inst->ex_mlen = reg_size;
953 spill_inst->size_written = 0;
954 spill_inst->header_size = 1;
955 spill_inst->send_has_side_effects = true;
956 spill_inst->send_is_volatile = false;
957 spill_inst->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
958 spill_inst->desc =
959 brw_dp_write_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
960 BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8),
961 GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
962 0 /* not a render target */,
963 false /* send_commit_msg */);
964 } else {
965 spill_inst = bld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
966 bld.null_reg_f(), src);
967 spill_inst->offset = spill_offset;
968 spill_inst->mlen = 1 + reg_size; /* header, value */
969 spill_inst->base_mrf = spill_base_mrf(bld.shader);
970 }
971 _mesa_set_add(spill_insts, spill_inst);
972
973 src.offset += reg_size * REG_SIZE;
974 spill_offset += reg_size * REG_SIZE;
975 }
976 }
977
978 void
979 fs_reg_alloc::set_spill_costs()
980 {
981 float block_scale = 1.0;
982 float spill_costs[fs->alloc.count];
983 bool no_spill[fs->alloc.count];
984
985 for (unsigned i = 0; i < fs->alloc.count; i++) {
986 spill_costs[i] = 0.0;
987 no_spill[i] = false;
988 }
989
990 /* Calculate costs for spilling nodes. Call it a cost of 1 per
991 * spill/unspill we'll have to do, and guess that the insides of
992 * loops run 10 times.
993 */
994 foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
995 for (unsigned int i = 0; i < inst->sources; i++) {
996 if (inst->src[i].file == VGRF)
997 spill_costs[inst->src[i].nr] += regs_read(inst, i) * block_scale;
998 }
999
1000 if (inst->dst.file == VGRF)
1001 spill_costs[inst->dst.nr] += regs_written(inst) * block_scale;
1002
1003 /* Don't spill anything we generated while spilling */
1004 if (_mesa_set_search(spill_insts, inst)) {
1005 for (unsigned int i = 0; i < inst->sources; i++) {
1006 if (inst->src[i].file == VGRF)
1007 no_spill[inst->src[i].nr] = true;
1008 }
1009 if (inst->dst.file == VGRF)
1010 no_spill[inst->dst.nr] = true;
1011 }
1012
1013 switch (inst->opcode) {
1014
1015 case BRW_OPCODE_DO:
1016 block_scale *= 10;
1017 break;
1018
1019 case BRW_OPCODE_WHILE:
1020 block_scale /= 10;
1021 break;
1022
1023 case BRW_OPCODE_IF:
1024 case BRW_OPCODE_IFF:
1025 block_scale *= 0.5;
1026 break;
1027
1028 case BRW_OPCODE_ENDIF:
1029 block_scale /= 0.5;
1030 break;
1031
1032 default:
1033 break;
1034 }
1035 }
1036
1037 for (unsigned i = 0; i < fs->alloc.count; i++) {
1038 /* Do the no_spill check first. Registers that are used as spill
1039 * temporaries may have been allocated after we calculated liveness so
1040 * we shouldn't look their liveness up. Fortunately, they're always
1041 * used in SCRATCH_READ/WRITE instructions so they'll always be flagged
1042 * no_spill.
1043 */
1044 if (no_spill[i])
1045 continue;
1046
1047 int live_length = live.vgrf_end[i] - live.vgrf_start[i];
1048 if (live_length <= 0)
1049 continue;
1050
1051 /* Divide the cost (in number of spills/fills) by the log of the length
1052 * of the live range of the register. This will encourage spill logic
1053 * to spill long-living things before spilling short-lived things where
1054 * spilling is less likely to actually do us any good. We use the log
1055 * of the length because it will fall off very quickly and not cause us
1056 * to spill medium length registers with more uses.
1057 */
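/* For instance, a register whose accumulated spill cost is 4 and whose
 * live range spans 100 instructions gets an adjusted cost of
 * 4 / ln(100), i.e. roughly 0.87.
 */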
1058 float adjusted_cost = spill_costs[i] / logf(live_length);
1059 ra_set_node_spill_cost(g, first_vgrf_node + i, adjusted_cost);
1060 }
1061
1062 have_spill_costs = true;
1063 }
1064
1065 int
1066 fs_reg_alloc::choose_spill_reg()
1067 {
1068 if (!have_spill_costs)
1069 set_spill_costs();
1070
1071 int node = ra_get_best_spill_node(g);
1072 if (node < 0)
1073 return -1;
1074
1075 assert(node >= first_vgrf_node);
1076 return node - first_vgrf_node;
1077 }
1078
1079 fs_reg
1080 fs_reg_alloc::alloc_scratch_header()
1081 {
1082 int vgrf = fs->alloc.allocate(1);
1083 assert(first_vgrf_node + vgrf == scratch_header_node);
1084 ra_set_node_class(g, scratch_header_node,
1085 compiler->fs_reg_sets[rsi].classes[0]);
1086
1087 setup_live_interference(scratch_header_node, 0, INT_MAX);
1088
1089 return fs_reg(VGRF, vgrf, BRW_REGISTER_TYPE_UD);
1090 }
1091
1092 fs_reg
1093 fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
1094 {
1095 int vgrf = fs->alloc.allocate(size);
1096 int n = ra_add_node(g, compiler->fs_reg_sets[rsi].classes[size - 1]);
1097 assert(n == first_vgrf_node + vgrf);
1098 assert(n == first_spill_node + spill_node_count);
1099
1100 setup_live_interference(n, ip - 1, ip + 1);
1101
1102 /* Add interference between this spill node and any other spill nodes for
1103 * the same instruction.
1104 */
1105 for (int s = 0; s < spill_node_count; s++) {
1106 if (spill_vgrf_ip[s] == ip)
1107 ra_add_node_interference(g, n, first_spill_node + s);
1108 }
1109
1110 /* Add this spill node to the list for next time */
1111 if (spill_node_count >= spill_vgrf_ip_alloc) {
1112 if (spill_vgrf_ip_alloc == 0)
1113 spill_vgrf_ip_alloc = 16;
1114 else
1115 spill_vgrf_ip_alloc *= 2;
1116 spill_vgrf_ip = reralloc(mem_ctx, spill_vgrf_ip, int,
1117 spill_vgrf_ip_alloc);
1118 }
1119 spill_vgrf_ip[spill_node_count++] = ip;
1120
1121 return fs_reg(VGRF, vgrf);
1122 }
1123
1124 void
1125 fs_reg_alloc::spill_reg(unsigned spill_reg)
1126 {
1127 int size = fs->alloc.sizes[spill_reg];
1128 unsigned int spill_offset = fs->last_scratch;
1129 assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
1130
1131 /* Spills may use MRFs 13-15 in the SIMD16 case. Our texturing is done
1132 * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
1133 * up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
1134 * m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
1135 * depth), starting from m1. In summary: We may not be able to spill in
1136 * SIMD16 mode, because we'd stomp the FB writes.
1137 */
1138 if (!fs->spilled_any_registers) {
1139 if (devinfo->gen >= 9) {
1140 this->scratch_header = alloc_scratch_header();
1141 fs_builder ubld = fs->bld.exec_all().group(8, 0).at(
1142 fs->cfg->first_block(), fs->cfg->first_block()->start());
1143 fs_inst *header_inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
1144 this->scratch_header);
1145 _mesa_set_add(spill_insts, header_inst);
1146 } else {
1147 bool mrf_used[BRW_MAX_MRF(devinfo->gen)];
1148 get_used_mrfs(fs, mrf_used);
1149
1150 for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) {
1151 if (mrf_used[i]) {
1152 fs->fail("Register spilling not supported with m%d used", i);
1153 return;
1154 }
1155 }
1156 }
1157
1158 fs->spilled_any_registers = true;
1159 }
1160
1161 fs->last_scratch += size * REG_SIZE;
1162
1163 /* We're about to replace all uses of this register. It no longer
1164 * conflicts with anything so we can get rid of its interference.
1165 */
1166 ra_set_node_spill_cost(g, first_vgrf_node + spill_reg, 0);
1167 ra_reset_node_interference(g, first_vgrf_node + spill_reg);
1168
1169 /* Generate spill/unspill instructions for the objects being
1170 * spilled. Right now, we spill or unspill the whole thing to a
1171 * virtual grf of the same size. For most instructions, though, we
1172 * could just spill/unspill the GRF being accessed.
1173 */
1174 int ip = 0;
1175 foreach_block_and_inst (block, fs_inst, inst, fs->cfg) {
1176 const fs_builder ibld = fs_builder(fs, block, inst);
1177 exec_node *before = inst->prev;
1178 exec_node *after = inst->next;
1179
1180 for (unsigned int i = 0; i < inst->sources; i++) {
1181 if (inst->src[i].file == VGRF &&
1182 inst->src[i].nr == spill_reg) {
1183 int count = regs_read(inst, i);
1184 int subset_spill_offset = spill_offset +
1185 ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
1186 fs_reg unspill_dst = alloc_spill_reg(count, ip);
1187
1188 inst->src[i].nr = unspill_dst.nr;
1189 inst->src[i].offset %= REG_SIZE;
1190
1191 /* We read the largest power-of-two divisor of the register count
1192 * (because only POT scratch read blocks are allowed by the
1193 * hardware) up to the maximum supported block size.
1194 */
1195 const unsigned width =
1196 MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));
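/* E.g., count == 3 gives 1 << (ffs(24) - 1) == 8, i.e. one GRF worth of
 * 32-bit channels per read, while count == 4 (already a power of two)
 * gives the maximum width of 32.
 */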
1197
1198 /* Set exec_all() on unspill messages under the (rather
1199 * pessimistic) assumption that there is no one-to-one
1200 * correspondence between channels of the spilled variable in
1201 * scratch space and the scratch read message, which operates on
1202 * 32 bit channels. It shouldn't hurt in any case because the
1203 * unspill destination is a block-local temporary.
1204 */
1205 emit_unspill(ibld.exec_all().group(width, 0), unspill_dst,
1206 subset_spill_offset, count);
1207 }
1208 }
1209
1210 if (inst->dst.file == VGRF &&
1211 inst->dst.nr == spill_reg) {
1212 int subset_spill_offset = spill_offset +
1213 ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
1214 fs_reg spill_src = alloc_spill_reg(regs_written(inst), ip);
1215
1216 inst->dst.nr = spill_src.nr;
1217 inst->dst.offset %= REG_SIZE;
1218
1219 /* If we're immediately spilling the register, we should not use
1220 * destination dependency hints. Doing so will cause the GPU to
1221 * try to read and write the register at the same time and may
1222 * hang the GPU.
1223 */
1224 inst->no_dd_clear = false;
1225 inst->no_dd_check = false;
1226
1227 /* Calculate the execution width of the scratch messages (which work
1228 * in terms of 32 bit components so we have a fixed number of eight
1229 * channels per spilled register). We attempt to write one
1230 * exec_size-wide component of the variable at a time without
1231 * exceeding the maximum number of (fake) MRF registers reserved for
1232 * spills.
1233 */
1234 const unsigned width = 8 * MIN2(
1235 DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE),
1236 spill_max_size(fs));
1237
1238 /* Spills should only write data initialized by the instruction for
1239 * whichever channels are enabled in the execution mask. If that's
1240 * not possible we'll have to emit a matching unspill before the
1241 * instruction and set force_writemask_all on the spill.
1242 */
1243 const bool per_channel =
1244 inst->dst.is_contiguous() && type_sz(inst->dst.type) == 4 &&
1245 inst->exec_size == width;
1246
1247 /* Builder used to emit the scratch messages. */
1248 const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);
1249
1250 /* If our write is going to affect just part of the
1251 * regs_written(inst), then we need to unspill the destination since
1252 * we write back out all of the regs_written(). If the original
1253 * instruction had force_writemask_all set and is not a partial
1254 * write, there should be no need for the unspill since the
1255 * instruction will be overwriting the whole destination in any case.
1256 */
1257 if (inst->is_partial_write() ||
1258 (!inst->force_writemask_all && !per_channel))
1259 emit_unspill(ubld, spill_src, subset_spill_offset,
1260 regs_written(inst));
1261
1262 emit_spill(ubld.at(block, inst->next), spill_src,
1263 subset_spill_offset, regs_written(inst));
1264 }
1265
1266 for (fs_inst *inst = (fs_inst *)before->next;
1267 inst != after; inst = (fs_inst *)inst->next)
1268 setup_inst_interference(inst);
1269
1270 /* We don't advance the ip for scratch read/write instructions
1271 * because we consider them to have the same ip as instruction we're
1272 * spilling around for the purposes of interference. Also, we're
1273 * inserting spill instructions without re-running liveness analysis
1274 * and we don't want to mess up our IPs.
1275 */
1276 if (!_mesa_set_search(spill_insts, inst))
1277 ip++;
1278 }
1279
1280 assert(ip == live_instr_count);
1281 }
1282
1283 bool
1284 fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
1285 {
1286 build_interference_graph(fs->spilled_any_registers || spill_all);
1287
1288 bool spilled = false;
1289 while (1) {
1290 /* Debug of register spilling: Go spill everything. */
1291 if (unlikely(spill_all)) {
1292 int reg = choose_spill_reg();
1293 if (reg != -1) {
1294 spill_reg(reg);
1295 continue;
1296 }
1297 }
1298
1299 if (ra_allocate(g))
1300 break;
1301
1302 if (!allow_spilling)
1303 return false;
1304
1305 /* Failed to allocate registers. Spill a reg, and the caller will
1306 * loop back into here to try again.
1307 */
1308 int reg = choose_spill_reg();
1309 if (reg == -1)
1310 return false;
1311
1312 /* If we're going to spill but we've never spilled before, we need to
1313 * re-build the interference graph with MRFs enabled to allow spilling.
1314 */
1315 if (!fs->spilled_any_registers) {
1316 discard_interference_graph();
1317 build_interference_graph(true);
1318 }
1319
1320 spilled = true;
1321
1322 spill_reg(reg);
1323 }
1324
1325 if (spilled)
1326 fs->invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
1327
1328 /* Get the chosen virtual registers for each node, and map virtual
1329 * regs in the register classes back down to real hardware reg
1330 * numbers.
1331 */
1332 unsigned hw_reg_mapping[fs->alloc.count];
1333 fs->grf_used = fs->first_non_payload_grf;
1334 for (unsigned i = 0; i < fs->alloc.count; i++) {
1335 int reg = ra_get_node_reg(g, first_vgrf_node + i);
1336
1337 hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
1338 fs->grf_used = MAX2(fs->grf_used,
1339 hw_reg_mapping[i] + fs->alloc.sizes[i]);
1340 }
1341
1342 foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
1343 assign_reg(hw_reg_mapping, &inst->dst);
1344 for (int i = 0; i < inst->sources; i++) {
1345 assign_reg(hw_reg_mapping, &inst->src[i]);
1346 }
1347 }
1348
1349 fs->alloc.count = fs->grf_used;
1350
1351 return true;
1352 }
1353
1354 bool
1355 fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
1356 {
1357 fs_reg_alloc alloc(this);
1358 bool success = alloc.assign_regs(allow_spilling, spill_all);
1359 if (!success && allow_spilling) {
1360 fail("no register to spill:\n");
1361 dump_instructions(NULL);
1362 }
1363 return success;
1364 }
1365