1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27 
28 #include "brw_eu.h"
29 #include "brw_fs.h"
30 #include "brw_cfg.h"
31 #include "util/set.h"
32 #include "util/register_allocate.h"
33 
34 using namespace brw;
35 
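/* Remap a VGRF reference to its assigned hardware register.  For example
 * (illustrative numbers): with REG_SIZE == 32, a VGRF whose base maps to
 * hardware register 20 and which is referenced at a byte offset of 40 ends
 * up as hardware register 21 with a residual offset of 8 bytes.
 */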
36 static void
37 assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
38 {
39    if (reg->file == VGRF) {
40       reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
41       reg->offset %= REG_SIZE;
42    }
43 }
44 
45 void
46 fs_visitor::assign_regs_trivial()
47 {
48    unsigned hw_reg_mapping[this->alloc.count + 1];
49    unsigned i;
50    int reg_width = dispatch_width / 8;
51 
52    /* Note that compressed instructions require alignment to 2 registers. */
53    hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
54    for (i = 1; i <= this->alloc.count; i++) {
55       hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
56 			   this->alloc.sizes[i - 1]);
57    }
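   /* A quick worked example of the mapping above (hypothetical numbers): with
    * first_non_payload_grf == 9 and reg_width == 2, hw_reg_mapping[0] is
    * ALIGN(9, 2) == 10; VGRF sizes of {2, 1, 4} then give hw_reg_mapping ==
    * {10, 12, 13, 17}, so grf_used below becomes 17.
    */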
58    this->grf_used = hw_reg_mapping[this->alloc.count];
59 
60    foreach_block_and_inst(block, fs_inst, inst, cfg) {
61       assign_reg(hw_reg_mapping, &inst->dst);
62       for (i = 0; i < inst->sources; i++) {
63          assign_reg(hw_reg_mapping, &inst->src[i]);
64       }
65    }
66 
67    if (this->grf_used >= max_grf) {
68       fail("Ran out of regs on trivial allocator (%d/%d)\n",
69 	   this->grf_used, max_grf);
70    } else {
71       this->alloc.count = this->grf_used;
72    }
73 
74 }
75 
76 /**
77  * Size of a register from the aligned_bary_class register class.
78  */
79 static unsigned
80 aligned_bary_size(unsigned dispatch_width)
81 {
82    return (dispatch_width == 8 ? 2 : 4);
83 }
84 
85 static void
86 brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
87 {
88    const struct gen_device_info *devinfo = compiler->devinfo;
89    int base_reg_count = BRW_MAX_GRF;
90    const int index = util_logbase2(dispatch_width / 8);
91 
92    if (dispatch_width > 8 && devinfo->gen >= 7) {
93       /* For IVB+, we don't need the PLN hacks or the even-reg alignment in
94        * SIMD16.  Therefore, we can use the exact same register sets for
95        * SIMD16 as we do for SIMD8 and we don't need to recalculate them.
96        */
97       compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0];
98       return;
99    }
100 
101    /* Almost all values handled in the compiler are scalar values occupying
102     * a single register (or 2 registers in the case of SIMD16, which is
103     * handled by dividing base_reg_count by 2 and multiplying allocated
104     * register numbers by 2).  Things that were
105     * aggregates of scalar values at the GLSL level were split to scalar
106     * values by split_virtual_grfs().
107     *
108     * However, texture SEND messages return a series of contiguous registers
109     * to write into.  We currently always ask for 4 registers, but we may
110     * convert that to use less some day.
111     *
112     * Additionally, on gen5 we need aligned pairs of registers for the PLN
113     * instruction, and on gen4 we need 8 contiguous regs for the SIMD16
114     * texturing workaround.
115     */
116    const int class_count = MAX_VGRF_SIZE;
117    int class_sizes[MAX_VGRF_SIZE];
118    for (unsigned i = 0; i < MAX_VGRF_SIZE; i++)
119       class_sizes[i] = i + 1;
120 
121    memset(compiler->fs_reg_sets[index].class_to_ra_reg_range, 0,
122           sizeof(compiler->fs_reg_sets[index].class_to_ra_reg_range));
123    int *class_to_ra_reg_range = compiler->fs_reg_sets[index].class_to_ra_reg_range;
124 
125    /* Compute the total number of registers across all classes. */
126    int ra_reg_count = 0;
127    for (int i = 0; i < class_count; i++) {
128       if (devinfo->gen <= 5 && dispatch_width >= 16) {
129          /* From the G45 PRM:
130           *
131           * In order to reduce the hardware complexity, the following
132           * rules and restrictions apply to the compressed instruction:
133           * ...
134           * * Operand Alignment Rule: With the exceptions listed below, a
135           *   source/destination operand in general should be aligned to
136           *   even 256-bit physical register with a region size equal to
137           *   two 256-bit physical register
138           */
139          ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
140       } else {
141          ra_reg_count += base_reg_count - (class_sizes[i] - 1);
142       }
143       /* Mark the last register. We'll fill in the beginnings later. */
144       class_to_ra_reg_range[class_sizes[i]] = ra_reg_count;
145    }
146 
147    /* Fill out the rest of the range markers */
148    for (int i = 1; i < 17; ++i) {
149       if (class_to_ra_reg_range[i] == 0)
150          class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1];
151    }
152 
153    uint8_t *ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
154    struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
155    if (devinfo->gen >= 6)
156       ra_set_allocate_round_robin(regs);
157    int *classes = ralloc_array(compiler, int, class_count);
158    int aligned_bary_class = -1;
159 
160    /* Allocate space for q values.  We allocate class_count + 1 because we
161     * want to leave room for the aligned barycentric class if we have it.
162     */
163    unsigned int **q_values = ralloc_array(compiler, unsigned int *,
164                                           class_count + 1);
165    for (int i = 0; i < class_count + 1; ++i)
166       q_values[i] = ralloc_array(q_values, unsigned int, class_count + 1);
167 
168    /* Now, add the registers to their classes, and add the conflicts
169     * between them and the base GRF registers (and also each other).
170     */
171    int reg = 0;
172    int aligned_bary_base_reg = 0;
173    int aligned_bary_reg_count = 0;
174    for (int i = 0; i < class_count; i++) {
175       int class_reg_count;
176       if (devinfo->gen <= 5 && dispatch_width >= 16) {
177          class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
178 
179          /* See comment below.  The only difference here is that we are
180           * dealing with pairs of registers instead of single registers.
181           * Registers of odd sizes simply get rounded up. */
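         /* Illustrative example of the paired formula: class sizes 3 and 5
          * give (3 + 1) / 2 + (5 + 1) / 2 - 1 == 4 conflicting register
          * pairs.
          */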
182          for (int j = 0; j < class_count; j++)
183             q_values[i][j] = (class_sizes[i] + 1) / 2 +
184                              (class_sizes[j] + 1) / 2 - 1;
185       } else {
186          class_reg_count = base_reg_count - (class_sizes[i] - 1);
187 
188          /* From register_allocate.c:
189           *
190           * q(B,C) (indexed by C, B is this register class) in
191           * Runeson/Nyström paper.  This is "how many registers of B could
192           * the worst choice register from C conflict with".
193           *
194           * If we just let the register allocation algorithm compute these
195           * values, it is extremely expensive.  However, since all of our
196           * registers are laid out, we can very easily compute them
197           * ourselves.  View the register from C as fixed starting at GRF n
198           * somewhere in the middle, and the register from B as sliding back
199           * and forth.  Then the first register to conflict from B is the
200           * one starting at n - class_size[B] + 1 and the last register to
201           * conflict will start at n + class_size[B] - 1.  Therefore, the
202           * number of conflicts from B is class_size[B] + class_size[C] - 1.
203           *
204           *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
205           * B | | | | | |n| --> | | | | | | |
206           *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
207           *             +-+-+-+-+-+
208           * C           |n| | | | |
209           *             +-+-+-+-+-+
210           */
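         /* Worked example (illustrative): with class_size[B] == 2 and
          * class_size[C] == 3, a register from C starting at GRF n overlaps
          * the B registers starting at n - 1, n, n + 1 and n + 2, i.e.
          * 2 + 3 - 1 == 4 of them.
          */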
211          for (int j = 0; j < class_count; j++)
212             q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
213       }
214       classes[i] = ra_alloc_reg_class(regs);
215 
216       /* Save this off for the aligned barycentric class at the end. */
217       if (class_sizes[i] == int(aligned_bary_size(dispatch_width))) {
218          aligned_bary_base_reg = reg;
219          aligned_bary_reg_count = class_reg_count;
220       }
221 
222       if (devinfo->gen <= 5 && dispatch_width >= 16) {
223          for (int j = 0; j < class_reg_count; j++) {
224             ra_class_add_reg(regs, classes[i], reg);
225 
226             ra_reg_to_grf[reg] = j * 2;
227 
228             for (int base_reg = j;
229                  base_reg < j + (class_sizes[i] + 1) / 2;
230                  base_reg++) {
231                ra_add_reg_conflict(regs, base_reg, reg);
232             }
233 
234             reg++;
235          }
236       } else {
237          for (int j = 0; j < class_reg_count; j++) {
238             ra_class_add_reg(regs, classes[i], reg);
239 
240             ra_reg_to_grf[reg] = j;
241 
242             for (int base_reg = j;
243                  base_reg < j + class_sizes[i];
244                  base_reg++) {
245                ra_add_reg_conflict(regs, base_reg, reg);
246             }
247 
248             reg++;
249          }
250       }
251    }
252    assert(reg == ra_reg_count);
253 
254    /* Applying transitivity to all of the base registers gives us the
255     * appropriate register conflict relationships everywhere.
256     */
257    for (int reg = 0; reg < base_reg_count; reg++)
258       ra_make_reg_conflicts_transitive(regs, reg);
259 
260    /* Add a special class for aligned barycentrics, which we'll put the
261     * first source of LINTERP on so that we can do PLN on Gen <= 6.
262     */
263    if (devinfo->has_pln && (devinfo->gen == 6 ||
264                             (dispatch_width == 8 && devinfo->gen <= 5))) {
265       aligned_bary_class = ra_alloc_reg_class(regs);
266 
267       for (int i = 0; i < aligned_bary_reg_count; i++) {
268 	 if ((ra_reg_to_grf[aligned_bary_base_reg + i] & 1) == 0) {
269 	    ra_class_add_reg(regs, aligned_bary_class,
270                              aligned_bary_base_reg + i);
271 	 }
272       }
273 
274       for (int i = 0; i < class_count; i++) {
275          /* These are a little counter-intuitive because the barycentric
276           * registers are required to be aligned while the registers they are
277           * potentially interfering with are not.  In the case where the size
278           * is even, the worst-case is that the register is odd-aligned.  In
279           * the odd-size case, it doesn't matter.
280           */
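         /* Illustrative numbers: in SIMD16 (aligned_bary_size == 4), a
          * 2-register class gets q_values[class_count][i] == 2 / 2 + 4 / 2 ==
          * 3 and q_values[i][class_count] == 2 + 4 - 1 == 5.
          */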
281          q_values[class_count][i] = class_sizes[i] / 2 +
282                                     aligned_bary_size(dispatch_width) / 2;
283          q_values[i][class_count] = class_sizes[i] +
284                                     aligned_bary_size(dispatch_width) - 1;
285       }
286       q_values[class_count][class_count] = aligned_bary_size(dispatch_width) - 1;
287    }
288 
289    ra_set_finalize(regs, q_values);
290 
291    ralloc_free(q_values);
292 
293    compiler->fs_reg_sets[index].regs = regs;
294    for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++)
295       compiler->fs_reg_sets[index].classes[i] = -1;
296    for (int i = 0; i < class_count; i++)
297       compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
298    compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
299    compiler->fs_reg_sets[index].aligned_bary_class = aligned_bary_class;
300 }
301 
302 void
303 brw_fs_alloc_reg_sets(struct brw_compiler *compiler)
304 {
305    brw_alloc_reg_set(compiler, 8);
306    brw_alloc_reg_set(compiler, 16);
307    brw_alloc_reg_set(compiler, 32);
308 }
309 
310 static int
311 count_to_loop_end(const bblock_t *block)
312 {
313    if (block->end()->opcode == BRW_OPCODE_WHILE)
314       return block->end_ip;
315 
316    int depth = 1;
317    /* Skip the first block, since we don't want to count the DO that the
318     * calling function found.
319     */
320    for (block = block->next();
321         depth > 0;
322         block = block->next()) {
323       if (block->start()->opcode == BRW_OPCODE_DO)
324          depth++;
325       if (block->end()->opcode == BRW_OPCODE_WHILE) {
326          depth--;
327          if (depth == 0)
328             return block->end_ip;
329       }
330    }
331    unreachable("not reached");
332 }
333 
334 void fs_visitor::calculate_payload_ranges(int payload_node_count,
335                                           int *payload_last_use_ip) const
336 {
337    int loop_depth = 0;
338    int loop_end_ip = 0;
339 
340    for (int i = 0; i < payload_node_count; i++)
341       payload_last_use_ip[i] = -1;
342 
343    int ip = 0;
344    foreach_block_and_inst(block, fs_inst, inst, cfg) {
345       switch (inst->opcode) {
346       case BRW_OPCODE_DO:
347          loop_depth++;
348 
349          /* Since payload regs are defined only at the start of the shader
350           * execution, any uses of the payload within a loop mean the live
351           * interval extends to the end of the outermost loop.  Find the ip of
352           * the end now.
353           */
354          if (loop_depth == 1)
355             loop_end_ip = count_to_loop_end(block);
356          break;
357       case BRW_OPCODE_WHILE:
358          loop_depth--;
359          break;
360       default:
361          break;
362       }
363 
364       int use_ip;
365       if (loop_depth > 0)
366          use_ip = loop_end_ip;
367       else
368          use_ip = ip;
369 
370       /* Note that UNIFORM args have been turned into FIXED_GRF by
371        * assign_curbe_setup(), and interpolation uses fixed hardware regs from
372        * the start (see interp_reg()).
373        */
374       for (int i = 0; i < inst->sources; i++) {
375          if (inst->src[i].file == FIXED_GRF) {
376             int node_nr = inst->src[i].nr;
377             if (node_nr >= payload_node_count)
378                continue;
379 
380             for (unsigned j = 0; j < regs_read(inst, i); j++) {
381                payload_last_use_ip[node_nr + j] = use_ip;
382                assert(node_nr + j < unsigned(payload_node_count));
383             }
384          }
385       }
386 
387       /* Special case instructions which have extra implied registers used. */
388       switch (inst->opcode) {
389       case CS_OPCODE_CS_TERMINATE:
390          payload_last_use_ip[0] = use_ip;
391          break;
392 
393       default:
394          if (inst->eot) {
395             /* We could omit this for the !inst->header_present case, except
396              * that the simulator apparently incorrectly reads from g0/g1
397              * instead of sideband.  It also really freaks out driver
398              * developers to see g0 used in unusual places, so just always
399              * reserve it.
400              */
401             payload_last_use_ip[0] = use_ip;
402             payload_last_use_ip[1] = use_ip;
403          }
404          break;
405       }
406 
407       ip++;
408    }
409 }
410 
411 class fs_reg_alloc {
412 public:
413    fs_reg_alloc(fs_visitor *fs):
414       fs(fs), devinfo(fs->devinfo), compiler(fs->compiler),
415       live(fs->live_analysis.require()), g(NULL),
416       have_spill_costs(false)
417    {
418       mem_ctx = ralloc_context(NULL);
419 
420       /* Stash the number of instructions so we can sanity check that our
421        * counts still match liveness.
422        */
423       live_instr_count = fs->cfg->last_block()->end_ip + 1;
424 
425       spill_insts = _mesa_pointer_set_create(mem_ctx);
426 
427       /* Most of this allocation was written for a reg_width of 1
428        * (dispatch_width == 8).  In extending to SIMD16, the code was
429        * left in place and it was converted to have the hardware
430        * registers it's allocating be contiguous physical pairs of regs
431        * for reg_width == 2.
432        */
433       int reg_width = fs->dispatch_width / 8;
434       rsi = util_logbase2(reg_width);
435       payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width);
436 
437       /* Get payload IP information */
438       payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count);
439 
440       node_count = 0;
441       first_payload_node = 0;
442       first_mrf_hack_node = 0;
443       scratch_header_node = 0;
444       grf127_send_hack_node = 0;
445       first_vgrf_node = 0;
446       last_vgrf_node = 0;
447       first_spill_node = 0;
448 
449       spill_vgrf_ip = NULL;
450       spill_vgrf_ip_alloc = 0;
451       spill_node_count = 0;
452    }
453 
454    ~fs_reg_alloc()
455    {
456       ralloc_free(mem_ctx);
457    }
458 
459    bool assign_regs(bool allow_spilling, bool spill_all);
460 
461 private:
462    void setup_live_interference(unsigned node,
463                                 int node_start_ip, int node_end_ip);
464    void setup_inst_interference(const fs_inst *inst);
465 
466    void build_interference_graph(bool allow_spilling);
467    void discard_interference_graph();
468 
469    void emit_unspill(const fs_builder &bld, fs_reg dst,
470                      uint32_t spill_offset, unsigned count);
471    void emit_spill(const fs_builder &bld, fs_reg src,
472                    uint32_t spill_offset, unsigned count);
473 
474    void set_spill_costs();
475    int choose_spill_reg();
476    fs_reg alloc_scratch_header();
477    fs_reg alloc_spill_reg(unsigned size, int ip);
478    void spill_reg(unsigned spill_reg);
479 
480    void *mem_ctx;
481    fs_visitor *fs;
482    const gen_device_info *devinfo;
483    const brw_compiler *compiler;
484    const fs_live_variables &live;
485    int live_instr_count;
486 
487    set *spill_insts;
488 
489    /* Which compiler->fs_reg_sets[] to use */
490    int rsi;
491 
492    ra_graph *g;
493    bool have_spill_costs;
494 
495    int payload_node_count;
496    int *payload_last_use_ip;
497 
498    int node_count;
499    int first_payload_node;
500    int first_mrf_hack_node;
501    int scratch_header_node;
502    int grf127_send_hack_node;
503    int first_vgrf_node;
504    int last_vgrf_node;
505    int first_spill_node;
506 
507    int *spill_vgrf_ip;
508    int spill_vgrf_ip_alloc;
509    int spill_node_count;
510 
511    fs_reg scratch_header;
512 };
513 
514 /**
515  * Sets the mrf_used array to indicate which MRFs are used by the shader IR
516  *
517  * This is used in assign_regs() to decide which of the GRFs that we use as
518  * MRFs on gen7 get normally register allocated, and in register spilling to
519  * see if we can actually use MRFs to do spills without overwriting normal MRF
520  * contents.
521  */
522 static void
523 get_used_mrfs(const fs_visitor *v, bool *mrf_used)
524 {
525    int reg_width = v->dispatch_width / 8;
526 
527    memset(mrf_used, 0, BRW_MAX_MRF(v->devinfo->gen) * sizeof(bool));
528 
529    foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
530       if (inst->dst.file == MRF) {
531          int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
532          mrf_used[reg] = true;
533          if (reg_width == 2) {
534             if (inst->dst.nr & BRW_MRF_COMPR4) {
535                mrf_used[reg + 4] = true;
536             } else {
537                mrf_used[reg + 1] = true;
538             }
539          }
540       }
541 
542       if (inst->mlen > 0) {
543 	 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
544             mrf_used[inst->base_mrf + i] = true;
545          }
546       }
547    }
548 }
549 
550 namespace {
551    /**
552     * Maximum spill block size we expect to encounter in 32B units.
553     *
554     * This is somewhat arbitrary and doesn't necessarily limit the maximum
555     * variable size that can be spilled -- A higher value will allow a
556     * variable of a given size to be spilled more efficiently with a smaller
557     * number of scratch messages, but will increase the likelihood of a
558     * collision between the MRFs reserved for spilling and other MRFs used by
559     * the program (and possibly increase GRF register pressure on platforms
560     * without hardware MRFs), which could cause register allocation to fail.
561     *
562     * For the moment reserve just enough space so a register of 32 bit
563     * component type and natural region width can be spilled without splitting
564     * into multiple (force_writemask_all) scratch messages.
565     */
566    unsigned
567    spill_max_size(const backend_shader *s)
568    {
569       /* FINISHME - On Gen7+ it should be possible to avoid this limit
570        *            altogether by spilling directly from the temporary GRF
571        *            allocated to hold the result of the instruction (and the
572        *            scratch write header).
573        */
574       /* FINISHME - The shader's dispatch width probably belongs in
575        *            backend_shader (or some nonexistent fs_shader class?)
576        *            rather than in the visitor class.
577        */
578       return static_cast<const fs_visitor *>(s)->dispatch_width / 8;
579    }
580 
581    /**
582     * First MRF register available for spilling.
583     */
584    unsigned
585    spill_base_mrf(const backend_shader *s)
586    {
587       /* We don't use the MRF hack on Gen9+ */
588       assert(s->devinfo->gen < 9);
589       return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1;
590    }
591 }
592 
593 void
594 fs_reg_alloc::setup_live_interference(unsigned node,
595                                       int node_start_ip, int node_end_ip)
596 {
597    /* Mark any virtual grf that is live between the start of the program and
598     * the last use of a payload node interfering with that payload node.
599     */
600    for (int i = 0; i < payload_node_count; i++) {
601       if (payload_last_use_ip[i] == -1)
602          continue;
603 
604       /* Note that we use a <= comparison, unlike vgrfs_interfere(),
605        * in order to not have to worry about the uniform issue described in
606        * calculate_live_intervals().
607        */
608       if (node_start_ip <= payload_last_use_ip[i])
609          ra_add_node_interference(g, node, first_payload_node + i);
610    }
611 
612    /* If we have the MRF hack enabled, mark this node as interfering with all
613     * MRF registers.
614     */
615    if (first_mrf_hack_node >= 0) {
616       for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++)
617          ra_add_node_interference(g, node, first_mrf_hack_node + i);
618    }
619 
620    /* Everything interferes with the scratch header */
621    if (scratch_header_node >= 0)
622       ra_add_node_interference(g, node, scratch_header_node);
623 
624    /* Add interference with every vgrf whose live range intersects this
625     * node's.  We only need to look at nodes below this one as the reflexivity
626     * of interference will take care of the rest.
627     */
628    for (unsigned n2 = first_vgrf_node;
629         n2 <= (unsigned)last_vgrf_node && n2 < node; n2++) {
630       unsigned vgrf = n2 - first_vgrf_node;
631       if (!(node_end_ip <= live.vgrf_start[vgrf] ||
632             live.vgrf_end[vgrf] <= node_start_ip))
633          ra_add_node_interference(g, node, n2);
634    }
635 }
636 
637 void
638 fs_reg_alloc::setup_inst_interference(const fs_inst *inst)
639 {
640    /* Certain instructions can't safely use the same register for their
641     * sources and destination.  Add interference.
642     */
643    if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
644       for (unsigned i = 0; i < inst->sources; i++) {
645          if (inst->src[i].file == VGRF) {
646             ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
647                                         first_vgrf_node + inst->src[i].nr);
648          }
649       }
650    }
651 
652    /* In 16-wide instructions we have an issue where a compressed
653     * instruction is actually two instructions executed simultaneously.
654     * It's actually ok to have the source and destination registers be
655     * the same.  In this case, each instruction over-writes its own
656     * source and there's no problem.  The real problem here is if the
657     * source and destination registers are off by one.  Then you can end
658     * up in a scenario where the first instruction over-writes the
659     * source of the second instruction.  Since the compiler doesn't know
660     * about this level of granularity, we simply make the source and
661     * destination interfere.
662     */
663    if (inst->exec_size >= 16 && inst->dst.file == VGRF) {
664       for (int i = 0; i < inst->sources; ++i) {
665          if (inst->src[i].file == VGRF) {
666             ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
667                                         first_vgrf_node + inst->src[i].nr);
668          }
669       }
670    }
671 
672    if (grf127_send_hack_node >= 0) {
673       /* From the Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
674        * subsection "EUISA Instructions", Send Message (page 990):
675        *
676        * "r127 must not be used for return address when there is a src and
677        * dest overlap in send instruction."
678        *
679        * We avoid using grf127 as part of the destination of send messages
680        * by adding a node interference to the grf127_send_hack_node.  This
681        * node has a fixed assignment to grf127.
682        *
683        * We don't apply it to SIMD16 instructions because previous code avoids
684        * any register overlap between sources and destination.
685        */
686       if (inst->exec_size < 16 && inst->is_send_from_grf() &&
687           inst->dst.file == VGRF)
688          ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
689                                      grf127_send_hack_node);
690 
691       /* Spilling instructions are generated as SEND messages from MRFs, but
692        * since Gen7+ supports sending from GRFs, the driver maps these MRF
693        * registers to GRFs.  Implementations reuse the dest of the send
694        * message as a source, so there is guaranteed to be an overlap and we
695        * create an interference between the destination and grf127.
696        */
697       if ((inst->opcode == SHADER_OPCODE_GEN7_SCRATCH_READ ||
698            inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ) &&
699           inst->dst.file == VGRF)
700          ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
701                                      grf127_send_hack_node);
702    }
703 
704    /* From the Skylake PRM Vol. 2a docs for sends:
705     *
706     *    "It is required that the second block of GRFs does not overlap with
707     *    the first block."
708     *
709     * Normally, this is taken care of by fixup_sends_duplicate_payload() but
710     * in the case where one of the registers is an undefined value, the
711     * register allocator may decide that they don't interfere even though
712     * they're used as sources in the same instruction.  We also need to add
713     * interference here.
714     */
715    if (devinfo->gen >= 9) {
716       if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
717           inst->src[2].file == VGRF && inst->src[3].file == VGRF &&
718           inst->src[2].nr != inst->src[3].nr)
719          ra_add_node_interference(g, first_vgrf_node + inst->src[2].nr,
720                                      first_vgrf_node + inst->src[3].nr);
721    }
722 
723    /* When we do send-from-GRF for FB writes, we need to ensure that the last
724     * write instruction sends from a high register.  This is because the
725     * vertex fetcher wants to start filling the low payload registers while
726     * the pixel data port is still working on writing out the memory.  If we
727     * don't do this, we get rendering artifacts.
728     *
729     * We could just do "something high".  Instead, we just pick the highest
730     * register that works.
731     */
732    if (inst->eot) {
733       const int vgrf = inst->opcode == SHADER_OPCODE_SEND ?
734                        inst->src[2].nr : inst->src[0].nr;
735       int size = fs->alloc.sizes[vgrf];
736       int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
737 
738       if (first_mrf_hack_node >= 0) {
739          /* If something happened to spill, we want to push the EOT send
740           * register early enough in the register file that we don't
741           * conflict with any used MRF hack registers.
742           */
743          reg -= BRW_MAX_MRF(devinfo->gen) - spill_base_mrf(fs);
744       } else if (grf127_send_hack_node >= 0) {
745          /* Avoid r127 which might be unusable if the node was previously
746           * written by a SIMD8 SEND message with source/destination overlap.
747           */
748          reg--;
749       }
750 
751       ra_set_node_reg(g, first_vgrf_node + vgrf, reg);
752    }
753 }
754 
755 void
756 fs_reg_alloc::build_interference_graph(bool allow_spilling)
757 {
758    /* Compute the RA node layout */
759    node_count = 0;
760    first_payload_node = node_count;
761    node_count += payload_node_count;
762    if (devinfo->gen >= 7 && devinfo->gen < 9 && allow_spilling) {
763       first_mrf_hack_node = node_count;
764       node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
765    } else {
766       first_mrf_hack_node = -1;
767    }
768    if (devinfo->gen >= 8) {
769       grf127_send_hack_node = node_count;
770       node_count ++;
771    } else {
772       grf127_send_hack_node = -1;
773    }
774    first_vgrf_node = node_count;
775    node_count += fs->alloc.count;
776    last_vgrf_node = node_count - 1;
777    if (devinfo->gen >= 9 && allow_spilling) {
778       scratch_header_node = node_count++;
779    } else {
780       scratch_header_node = -1;
781    }
782    first_spill_node = node_count;
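   /* At this point the node space is laid out, in order, as: the payload
    * nodes, the optional MRF-hack nodes, the optional grf127 node, one node
    * per VGRF, the optional scratch-header node, and finally any spill nodes
    * added later by alloc_spill_reg().
    */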
783 
784    fs->calculate_payload_ranges(payload_node_count,
785                                 payload_last_use_ip);
786 
787    assert(g == NULL);
788    g = ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
789    ralloc_steal(mem_ctx, g);
790 
791    /* Set up the payload nodes */
792    for (int i = 0; i < payload_node_count; i++) {
793       /* Mark each payload node as being allocated to its physical register.
794        *
795        * The alternative would be to have per-physical-register classes, which
796        * would just be silly.
797        */
798       if (devinfo->gen <= 5 && fs->dispatch_width >= 16) {
799          /* We have to divide by 2 here because we only have even numbered
800           * registers.  Some of the payload registers will be odd, but
801           * that's ok because their physical register numbers have already
802           * been assigned.  The only thing this is used for is interference.
803           */
804          ra_set_node_reg(g, first_payload_node + i, i / 2);
805       } else {
806          ra_set_node_reg(g, first_payload_node + i, i);
807       }
808    }
809 
810    if (first_mrf_hack_node >= 0) {
811       /* Mark each MRF reg node as being allocated to its physical
812        * register.
813        *
814        * The alternative would be to have per-physical-register classes,
815        * which would just be silly.
816        */
817       for (int i = 0; i < BRW_MAX_MRF(devinfo->gen); i++) {
818          ra_set_node_reg(g, first_mrf_hack_node + i,
819                             GEN7_MRF_HACK_START + i);
820       }
821    }
822 
823    if (grf127_send_hack_node >= 0)
824       ra_set_node_reg(g, grf127_send_hack_node, 127);
825 
826    /* Specify the classes of each virtual register. */
827    for (unsigned i = 0; i < fs->alloc.count; i++) {
828       unsigned size = fs->alloc.sizes[i];
829 
830       assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) &&
831              "Register allocation relies on split_virtual_grfs()");
832 
833       ra_set_node_class(g, first_vgrf_node + i,
834                         compiler->fs_reg_sets[rsi].classes[size - 1]);
835    }
836 
837    /* Special case: on pre-Gen7 hardware that supports PLN, the second operand
838     * of a PLN instruction needs to be an even-numbered register, so we have a
839     * special register class aligned_bary_class to handle this case.
840     */
841    if (compiler->fs_reg_sets[rsi].aligned_bary_class >= 0) {
842       foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
843          if (inst->opcode == FS_OPCODE_LINTERP && inst->src[0].file == VGRF &&
844              fs->alloc.sizes[inst->src[0].nr] ==
845                aligned_bary_size(fs->dispatch_width)) {
846             ra_set_node_class(g, first_vgrf_node + inst->src[0].nr,
847                               compiler->fs_reg_sets[rsi].aligned_bary_class);
848          }
849       }
850    }
851 
852    /* Add interference based on the live range of the register */
853    for (unsigned i = 0; i < fs->alloc.count; i++) {
854       setup_live_interference(first_vgrf_node + i,
855                               live.vgrf_start[i],
856                               live.vgrf_end[i]);
857    }
858 
859    /* Add interference based on the instructions in which a register is used.
860     */
861    foreach_block_and_inst(block, fs_inst, inst, fs->cfg)
862       setup_inst_interference(inst);
863 }
864 
865 void
866 fs_reg_alloc::discard_interference_graph()
867 {
868    ralloc_free(g);
869    g = NULL;
870    have_spill_costs = false;
871 }
872 
873 void
874 fs_reg_alloc::emit_unspill(const fs_builder &bld, fs_reg dst,
875                            uint32_t spill_offset, unsigned count)
876 {
877    const gen_device_info *devinfo = bld.shader->devinfo;
878    const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
879                              REG_SIZE;
880    assert(count % reg_size == 0);
881 
882    for (unsigned i = 0; i < count / reg_size; i++) {
883       fs_inst *unspill_inst;
884       if (devinfo->gen >= 9) {
885          fs_reg header = this->scratch_header;
886          fs_builder ubld = bld.exec_all().group(1, 0);
887          assert(spill_offset % 16 == 0);
888          unspill_inst = ubld.MOV(component(header, 2),
889                                  brw_imm_ud(spill_offset / 16));
890          _mesa_set_add(spill_insts, unspill_inst);
891 
892          fs_reg srcs[] = { brw_imm_ud(0), brw_imm_ud(0), header };
893          unspill_inst = bld.emit(SHADER_OPCODE_SEND, dst,
894                                  srcs, ARRAY_SIZE(srcs));
895          unspill_inst->mlen = 1;
896          unspill_inst->header_size = 1;
897          unspill_inst->size_written = reg_size * REG_SIZE;
898          unspill_inst->send_has_side_effects = false;
899          unspill_inst->send_is_volatile = true;
900          unspill_inst->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
901          unspill_inst->desc =
902             brw_dp_read_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
903           * on Gen9+ it would cause the DC to do an IA-coherent read, which
904                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
905                              BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
906       } else if (devinfo->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
907          /* The Gen7 descriptor-based offset is 12 bits of HWORD units.
908           * Because the Gen7-style scratch block read is hardwired to BTI 255,
909           * on Gen9+ it would cause the DC to do an IA-coherent read, what
910           * largely outweighs the slight advantage from not having to provide
911           * the address as part of the message header, so we're better off
912           * using plain old oword block reads.
913           */
914          unspill_inst = bld.emit(SHADER_OPCODE_GEN7_SCRATCH_READ, dst);
915          unspill_inst->offset = spill_offset;
916       } else {
917          unspill_inst = bld.emit(SHADER_OPCODE_GEN4_SCRATCH_READ, dst);
918          unspill_inst->offset = spill_offset;
919          unspill_inst->base_mrf = spill_base_mrf(bld.shader);
920          unspill_inst->mlen = 1; /* header contains offset */
921       }
922       _mesa_set_add(spill_insts, unspill_inst);
923 
924       dst.offset += reg_size * REG_SIZE;
925       spill_offset += reg_size * REG_SIZE;
926    }
927 }
928 
929 void
930 fs_reg_alloc::emit_spill(const fs_builder &bld, fs_reg src,
931                          uint32_t spill_offset, unsigned count)
932 {
933    const gen_device_info *devinfo = bld.shader->devinfo;
934    const unsigned reg_size = src.component_size(bld.dispatch_width()) /
935                              REG_SIZE;
936    assert(count % reg_size == 0);
937 
938    for (unsigned i = 0; i < count / reg_size; i++) {
939       fs_inst *spill_inst;
940       if (devinfo->gen >= 9) {
941          fs_reg header = this->scratch_header;
942          fs_builder ubld = bld.exec_all().group(1, 0);
943          assert(spill_offset % 16 == 0);
944          spill_inst = ubld.MOV(component(header, 2),
945                                brw_imm_ud(spill_offset / 16));
946          _mesa_set_add(spill_insts, spill_inst);
947 
948          fs_reg srcs[] = { brw_imm_ud(0), brw_imm_ud(0), header, src };
949          spill_inst = bld.emit(SHADER_OPCODE_SEND, bld.null_reg_f(),
950                                srcs, ARRAY_SIZE(srcs));
951          spill_inst->mlen = 1;
952          spill_inst->ex_mlen = reg_size;
953          spill_inst->size_written = 0;
954          spill_inst->header_size = 1;
955          spill_inst->send_has_side_effects = true;
956          spill_inst->send_is_volatile = false;
957          spill_inst->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
958          spill_inst->desc =
959             brw_dp_write_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
960                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8),
961                               GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
962                               0 /* not a render target */,
963                               false /* send_commit_msg */);
964       } else {
965          spill_inst = bld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
966                                bld.null_reg_f(), src);
967          spill_inst->offset = spill_offset;
968          spill_inst->mlen = 1 + reg_size; /* header, value */
969          spill_inst->base_mrf = spill_base_mrf(bld.shader);
970       }
971       _mesa_set_add(spill_insts, spill_inst);
972 
973       src.offset += reg_size * REG_SIZE;
974       spill_offset += reg_size * REG_SIZE;
975    }
976 }
977 
978 void
979 fs_reg_alloc::set_spill_costs()
980 {
981    float block_scale = 1.0;
982    float spill_costs[fs->alloc.count];
983    bool no_spill[fs->alloc.count];
984 
985    for (unsigned i = 0; i < fs->alloc.count; i++) {
986       spill_costs[i] = 0.0;
987       no_spill[i] = false;
988    }
989 
990    /* Calculate costs for spilling nodes.  Call it a cost of 1 per
991     * spill/unspill we'll have to do, and guess that the insides of
992     * loops run 10 times.
993     */
994    foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
995       for (unsigned int i = 0; i < inst->sources; i++) {
996 	 if (inst->src[i].file == VGRF)
997             spill_costs[inst->src[i].nr] += regs_read(inst, i) * block_scale;
998       }
999 
1000       if (inst->dst.file == VGRF)
1001          spill_costs[inst->dst.nr] += regs_written(inst) * block_scale;
1002 
1003       /* Don't spill anything we generated while spilling */
1004       if (_mesa_set_search(spill_insts, inst)) {
1005          for (unsigned int i = 0; i < inst->sources; i++) {
1006 	    if (inst->src[i].file == VGRF)
1007                no_spill[inst->src[i].nr] = true;
1008          }
1009 	 if (inst->dst.file == VGRF)
1010             no_spill[inst->dst.nr] = true;
1011       }
1012 
1013       switch (inst->opcode) {
1014 
1015       case BRW_OPCODE_DO:
1016 	 block_scale *= 10;
1017 	 break;
1018 
1019       case BRW_OPCODE_WHILE:
1020 	 block_scale /= 10;
1021 	 break;
1022 
1023       case BRW_OPCODE_IF:
1024       case BRW_OPCODE_IFF:
1025          block_scale *= 0.5;
1026          break;
1027 
1028       case BRW_OPCODE_ENDIF:
1029          block_scale /= 0.5;
1030          break;
1031 
1032       default:
1033 	 break;
1034       }
1035    }
1036 
1037    for (unsigned i = 0; i < fs->alloc.count; i++) {
1038       /* Do the no_spill check first.  Registers that are used as spill
1039        * temporaries may have been allocated after we calculated liveness so
1040        * we shouldn't look their liveness up.  Fortunately, they're always
1041        * used in SCRATCH_READ/WRITE instructions so they'll always be flagged
1042        * no_spill.
1043        */
1044       if (no_spill[i])
1045          continue;
1046 
1047       int live_length = live.vgrf_end[i] - live.vgrf_start[i];
1048       if (live_length <= 0)
1049          continue;
1050 
1051       /* Divide the cost (in number of spills/fills) by the log of the length
1052        * of the live range of the register.  This will encourage spill logic
1053        * to spill long-living things before spilling short-lived things where
1054        * spilling is less likely to actually do us any good.  We use the log
1055        * of the length because it will fall off very quickly and not cause us
1056        * to spill medium length registers with more uses.
1057        */
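      /* For example (illustrative numbers): two registers with the same raw
       * cost but live lengths of 10 and 1000 instructions get adjusted costs
       * of cost / 2.3 and cost / 6.9 respectively, so the long-lived one
       * looks roughly 3x cheaper to spill.
       */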
1058       float adjusted_cost = spill_costs[i] / logf(live_length);
1059       ra_set_node_spill_cost(g, first_vgrf_node + i, adjusted_cost);
1060    }
1061 
1062    have_spill_costs = true;
1063 }
1064 
1065 int
1066 fs_reg_alloc::choose_spill_reg()
1067 {
1068    if (!have_spill_costs)
1069       set_spill_costs();
1070 
1071    int node = ra_get_best_spill_node(g);
1072    if (node < 0)
1073       return -1;
1074 
1075    assert(node >= first_vgrf_node);
1076    return node - first_vgrf_node;
1077 }
1078 
1079 fs_reg
1080 fs_reg_alloc::alloc_scratch_header()
1081 {
1082    int vgrf = fs->alloc.allocate(1);
1083    assert(first_vgrf_node + vgrf == scratch_header_node);
1084    ra_set_node_class(g, scratch_header_node,
1085                         compiler->fs_reg_sets[rsi].classes[0]);
1086 
1087    setup_live_interference(scratch_header_node, 0, INT_MAX);
1088 
1089    return fs_reg(VGRF, vgrf, BRW_REGISTER_TYPE_UD);
1090 }
1091 
1092 fs_reg
1093 fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
1094 {
1095    int vgrf = fs->alloc.allocate(size);
1096    int n = ra_add_node(g, compiler->fs_reg_sets[rsi].classes[size - 1]);
1097    assert(n == first_vgrf_node + vgrf);
1098    assert(n == first_spill_node + spill_node_count);
1099 
1100    setup_live_interference(n, ip - 1, ip + 1);
1101 
1102    /* Add interference between this spill node and any other spill nodes for
1103     * the same instruction.
1104     */
1105    for (int s = 0; s < spill_node_count; s++) {
1106       if (spill_vgrf_ip[s] == ip)
1107          ra_add_node_interference(g, n, first_spill_node + s);
1108    }
1109 
1110    /* Add this spill node to the list for next time */
1111    if (spill_node_count >= spill_vgrf_ip_alloc) {
1112       if (spill_vgrf_ip_alloc == 0)
1113          spill_vgrf_ip_alloc = 16;
1114       else
1115          spill_vgrf_ip_alloc *= 2;
1116       spill_vgrf_ip = reralloc(mem_ctx, spill_vgrf_ip, int,
1117                                spill_vgrf_ip_alloc);
1118    }
1119    spill_vgrf_ip[spill_node_count++] = ip;
1120 
1121    return fs_reg(VGRF, vgrf);
1122 }
1123 
1124 void
1125 fs_reg_alloc::spill_reg(unsigned spill_reg)
1126 {
1127    int size = fs->alloc.sizes[spill_reg];
1128    unsigned int spill_offset = fs->last_scratch;
1129    assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
1130 
1131    /* Spills may use MRFs 13-15 in the SIMD16 case.  Our texturing is done
1132     * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
1133     * up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
1134     * m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
1135     * depth), starting from m1.  In summary: We may not be able to spill in
1136     * SIMD16 mode, because we'd stomp the FB writes.
1137     */
1138    if (!fs->spilled_any_registers) {
1139       if (devinfo->gen >= 9) {
1140          this->scratch_header = alloc_scratch_header();
1141          fs_builder ubld = fs->bld.exec_all().group(8, 0).at(
1142             fs->cfg->first_block(), fs->cfg->first_block()->start());
1143          fs_inst *header_inst = ubld.emit(SHADER_OPCODE_SCRATCH_HEADER,
1144                                           this->scratch_header);
1145          _mesa_set_add(spill_insts, header_inst);
1146       } else {
1147          bool mrf_used[BRW_MAX_MRF(devinfo->gen)];
1148          get_used_mrfs(fs, mrf_used);
1149 
1150          for (int i = spill_base_mrf(fs); i < BRW_MAX_MRF(devinfo->gen); i++) {
1151             if (mrf_used[i]) {
1152                fs->fail("Register spilling not supported with m%d used", i);
1153                return;
1154             }
1155          }
1156       }
1157 
1158       fs->spilled_any_registers = true;
1159    }
1160 
1161    fs->last_scratch += size * REG_SIZE;
1162 
1163    /* We're about to replace all uses of this register.  It no longer
1164     * conflicts with anything so we can get rid of its interference.
1165     */
1166    ra_set_node_spill_cost(g, first_vgrf_node + spill_reg, 0);
1167    ra_reset_node_interference(g, first_vgrf_node + spill_reg);
1168 
1169    /* Generate spill/unspill instructions for the objects being
1170     * spilled.  Right now, we spill or unspill the whole thing to a
1171     * virtual grf of the same size.  For most instructions, though, we
1172     * could just spill/unspill the GRF being accessed.
1173     */
1174    int ip = 0;
1175    foreach_block_and_inst (block, fs_inst, inst, fs->cfg) {
1176       const fs_builder ibld = fs_builder(fs, block, inst);
1177       exec_node *before = inst->prev;
1178       exec_node *after = inst->next;
1179 
1180       for (unsigned int i = 0; i < inst->sources; i++) {
1181 	 if (inst->src[i].file == VGRF &&
1182              inst->src[i].nr == spill_reg) {
1183             int count = regs_read(inst, i);
1184             int subset_spill_offset = spill_offset +
1185                ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
1186             fs_reg unspill_dst = alloc_spill_reg(count, ip);
1187 
1188             inst->src[i].nr = unspill_dst.nr;
1189             inst->src[i].offset %= REG_SIZE;
1190 
1191             /* We read the largest power-of-two divisor of the register count
1192              * (because only POT scratch read blocks are allowed by the
1193              * hardware) up to the maximum supported block size.
1194              */
1195             const unsigned width =
1196                MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));
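            /* Illustrative example: count == 3 gives 24 channels, whose
             * largest power-of-two divisor is 8, so the unspill uses an
             * execution width of 8; count == 4 gives the full width of 32.
             */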
1197 
1198             /* Set exec_all() on unspill messages under the (rather
1199              * pessimistic) assumption that there is no one-to-one
1200              * correspondence between channels of the spilled variable in
1201              * scratch space and the scratch read message, which operates on
1202              * 32 bit channels.  It shouldn't hurt in any case because the
1203              * unspill destination is a block-local temporary.
1204              */
1205             emit_unspill(ibld.exec_all().group(width, 0), unspill_dst,
1206                          subset_spill_offset, count);
1207 	 }
1208       }
1209 
1210       if (inst->dst.file == VGRF &&
1211           inst->dst.nr == spill_reg) {
1212          int subset_spill_offset = spill_offset +
1213             ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
1214          fs_reg spill_src = alloc_spill_reg(regs_written(inst), ip);
1215 
1216          inst->dst.nr = spill_src.nr;
1217          inst->dst.offset %= REG_SIZE;
1218 
1219          /* If we're immediately spilling the register, we should not use
1220           * destination dependency hints.  Doing so will cause the GPU to
1221           * try to read and write the register at the same time and may
1222           * hang the GPU.
1223           */
1224          inst->no_dd_clear = false;
1225          inst->no_dd_check = false;
1226 
1227          /* Calculate the execution width of the scratch messages (which work
1228           * in terms of 32 bit components so we have a fixed number of eight
1229           * channels per spilled register).  We attempt to write one
1230           * exec_size-wide component of the variable at a time without
1231           * exceeding the maximum number of (fake) MRF registers reserved for
1232           * spills.
1233           */
1234          const unsigned width = 8 * MIN2(
1235             DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE),
1236             spill_max_size(fs));
1237 
1238          /* Spills should only write data initialized by the instruction for
1239           * whichever channels are enabled in the execution mask.  If that's
1240           * not possible we'll have to emit a matching unspill before the
1241           * instruction and set force_writemask_all on the spill.
1242           */
1243          const bool per_channel =
1244             inst->dst.is_contiguous() && type_sz(inst->dst.type) == 4 &&
1245             inst->exec_size == width;
1246 
1247          /* Builder used to emit the scratch messages. */
1248          const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);
1249 
1250 	 /* If our write is going to affect just part of the
1251           * regs_written(inst), then we need to unspill the destination since
1252           * we write back out all of the regs_written().  If the original
1253           * instruction had force_writemask_all set and is not a partial
1254           * write, there should be no need for the unspill since the
1255           * instruction will be overwriting the whole destination in any case.
1256 	  */
1257          if (inst->is_partial_write() ||
1258              (!inst->force_writemask_all && !per_channel))
1259             emit_unspill(ubld, spill_src, subset_spill_offset,
1260                          regs_written(inst));
1261 
1262          emit_spill(ubld.at(block, inst->next), spill_src,
1263                     subset_spill_offset, regs_written(inst));
1264       }
1265 
1266       for (fs_inst *inst = (fs_inst *)before->next;
1267            inst != after; inst = (fs_inst *)inst->next)
1268          setup_inst_interference(inst);
1269 
1270       /* We don't advance the ip for scratch read/write instructions
1271        * because we consider them to have the same ip as the instruction we're
1272        * spilling around for the purposes of interference.  Also, we're
1273        * inserting spill instructions without re-running liveness analysis
1274        * and we don't want to mess up our IPs.
1275        */
1276       if (!_mesa_set_search(spill_insts, inst))
1277          ip++;
1278    }
1279 
1280    assert(ip == live_instr_count);
1281 }
1282 
1283 bool
1284 fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
1285 {
1286    build_interference_graph(fs->spilled_any_registers || spill_all);
1287 
1288    bool spilled = false;
1289    while (1) {
1290       /* Debug of register spilling: Go spill everything. */
1291       if (unlikely(spill_all)) {
1292          int reg = choose_spill_reg();
1293          if (reg != -1) {
1294             spill_reg(reg);
1295             continue;
1296          }
1297       }
1298 
1299       if (ra_allocate(g))
1300          break;
1301 
1302       if (!allow_spilling)
1303          return false;
1304 
1305       /* Failed to allocate registers.  Spill a reg, and the caller will
1306        * loop back into here to try again.
1307        */
1308       int reg = choose_spill_reg();
1309       if (reg == -1)
1310          return false;
1311 
1312       /* If we're going to spill but we've never spilled before, we need to
1313        * re-build the interference graph with MRFs enabled to allow spilling.
1314        */
1315       if (!fs->spilled_any_registers) {
1316          discard_interference_graph();
1317          build_interference_graph(true);
1318       }
1319 
1320       spilled = true;
1321 
1322       spill_reg(reg);
1323    }
1324 
1325    if (spilled)
1326       fs->invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
1327 
1328    /* Get the chosen virtual registers for each node, and map virtual
1329     * regs in the register classes back down to real hardware reg
1330     * numbers.
1331     */
1332    unsigned hw_reg_mapping[fs->alloc.count];
1333    fs->grf_used = fs->first_non_payload_grf;
1334    for (unsigned i = 0; i < fs->alloc.count; i++) {
1335       int reg = ra_get_node_reg(g, first_vgrf_node + i);
1336 
1337       hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
1338       fs->grf_used = MAX2(fs->grf_used,
1339 			  hw_reg_mapping[i] + fs->alloc.sizes[i]);
1340    }
1341 
1342    foreach_block_and_inst(block, fs_inst, inst, fs->cfg) {
1343       assign_reg(hw_reg_mapping, &inst->dst);
1344       for (int i = 0; i < inst->sources; i++) {
1345          assign_reg(hw_reg_mapping, &inst->src[i]);
1346       }
1347    }
1348 
1349    fs->alloc.count = fs->grf_used;
1350 
1351    return true;
1352 }
1353 
1354 bool
1355 fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
1356 {
1357    fs_reg_alloc alloc(this);
1358    bool success = alloc.assign_regs(allow_spilling, spill_all);
1359    if (!success && allow_spilling) {
1360       fail("no register to spill:\n");
1361       dump_instructions(NULL);
1362    }
1363    return success;
1364 }
1365