1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27 
28 #include "elk_eu.h"
29 #include "elk_fs.h"
30 #include "elk_fs_builder.h"
31 #include "elk_cfg.h"
32 #include "util/set.h"
33 #include "util/register_allocate.h"
34 
35 using namespace elk;
36 
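/* One register class per contiguous allocation size, in whole registers
 * (1..REG_CLASS_COUNT); this must match MAX_VGRF_SIZE(devinfo) /
 * reg_unit(devinfo), see the assert in elk_alloc_reg_set().
 */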
37 #define REG_CLASS_COUNT 20
38 
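/* Rewrite a VGRF reference into its assigned hardware GRF.  Illustrative
 * example (assuming reg_unit() == 1 and REG_SIZE == 32): if
 * reg_hw_locations[reg->nr] == 20 and reg->offset == 40, the reference
 * becomes hardware GRF 21 with a residual offset of 8 bytes.
 */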
39 static void
40 assign_reg(const struct intel_device_info *devinfo,
41            unsigned *reg_hw_locations, elk_fs_reg *reg)
42 {
43    if (reg->file == VGRF) {
44       reg->nr = reg_unit(devinfo) * reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
45       reg->offset %= REG_SIZE;
46    }
47 }
48 
49 void
50 elk_fs_visitor::assign_regs_trivial()
51 {
52    unsigned hw_reg_mapping[this->alloc.count + 1];
53    unsigned i;
54    int reg_width = dispatch_width / 8;
55 
56    /* Note that compressed instructions require alignment to 2 registers. */
57    hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
58    for (i = 1; i <= this->alloc.count; i++) {
59       hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
60                            DIV_ROUND_UP(this->alloc.sizes[i - 1],
61                                         reg_unit(devinfo)));
62    }
63    this->grf_used = hw_reg_mapping[this->alloc.count];
64 
65    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
66       assign_reg(devinfo, hw_reg_mapping, &inst->dst);
67       for (i = 0; i < inst->sources; i++) {
68          assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
69       }
70    }
71 
72    if (this->grf_used >= max_grf) {
73       fail("Ran out of regs on trivial allocator (%d/%d)\n",
74 	   this->grf_used, max_grf);
75    } else {
76       this->alloc.count = this->grf_used;
77    }
78 
79 }
80 
81 /**
82  * Size of a register from the aligned_bary_class register class.
83  */
84 static unsigned
85 aligned_bary_size(unsigned dispatch_width)
86 {
87    return (dispatch_width == 8 ? 2 : 4);
88 }
89 
90 static void
91 elk_alloc_reg_set(struct elk_compiler *compiler, int dispatch_width)
92 {
93    const struct intel_device_info *devinfo = compiler->devinfo;
94    int base_reg_count = ELK_MAX_GRF;
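   /* index selects which compiler->fs_reg_sets[] entry to build: 0, 1 or 2
    * for SIMD8, SIMD16 and SIMD32 respectively.
    */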
95    const int index = util_logbase2(dispatch_width / 8);
96 
97    if (dispatch_width > 8 && devinfo->ver >= 7) {
98       /* For IVB+, we don't need the PLN hacks or the even-reg alignment in
99        * SIMD16.  Therefore, we can use the exact same register sets for
100        * SIMD16 as we do for SIMD8 and we don't need to recalculate them.
101        */
102       compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0];
103       return;
104    }
105 
106    /* The registers used to make up almost all values handled in the compiler
107     * are a scalar value occupying a single register (or 2 registers in the
108     * case of SIMD16, which is handled by dividing base_reg_count by 2 and
109     * multiplying allocated register numbers by 2).  Things that were
110     * aggregates of scalar values at the GLSL level were split to scalar
111     * values by split_virtual_grfs().
112     *
113     * However, texture SEND messages return a series of contiguous registers
114     * to write into.  We currently always ask for 4 registers, but we may
115     * convert that to use less some day.
116     *
117     * Additionally, on gfx5 we need aligned pairs of registers for the PLN
118     * instruction, and on gfx4 we need 8 contiguous regs for workaround simd16
119     * texturing.
120     */
121    assert(REG_CLASS_COUNT == MAX_VGRF_SIZE(devinfo) / reg_unit(devinfo));
122    int class_sizes[REG_CLASS_COUNT];
123    for (unsigned i = 0; i < REG_CLASS_COUNT; i++)
124       class_sizes[i] = i + 1;
125 
126    struct ra_regs *regs = ra_alloc_reg_set(compiler, ELK_MAX_GRF, false);
127    if (devinfo->ver >= 6)
128       ra_set_allocate_round_robin(regs);
129    struct ra_class **classes = ralloc_array(compiler, struct ra_class *,
130                                             REG_CLASS_COUNT);
131    struct ra_class *aligned_bary_class = NULL;
132 
133    /* Now, make the register classes for each size of contiguous register
134     * allocation we might need to make.
135     */
136    for (int i = 0; i < REG_CLASS_COUNT; i++) {
137       classes[i] = ra_alloc_contig_reg_class(regs, class_sizes[i]);
138 
139       if (devinfo->ver <= 5 && dispatch_width >= 16) {
140          /* From the G45 PRM:
141           *
142           * In order to reduce the hardware complexity, the following
143           * rules and restrictions apply to the compressed instruction:
144           * ...
145           * * Operand Alignment Rule: With the exceptions listed below, a
146           *   source/destination operand in general should be aligned to
147           *   even 256-bit physical register with a region size equal to
148           *   two 256-bit physical register
149           */
150          for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg += 2)
151             ra_class_add_reg(classes[i], reg);
152       } else {
153          for (int reg = 0; reg <= base_reg_count - class_sizes[i]; reg++)
154             ra_class_add_reg(classes[i], reg);
155       }
156    }
157 
158    /* Add a special class for aligned barycentrics, which we'll put the
159     * first source of LINTERP on so that we can do PLN on Gen <= 6.
160     */
161    if (devinfo->has_pln && (devinfo->ver == 6 ||
162                             (dispatch_width == 8 && devinfo->ver <= 5))) {
163       int contig_len = aligned_bary_size(dispatch_width);
164       aligned_bary_class = ra_alloc_contig_reg_class(regs, contig_len);
165 
166       for (int i = 0; i <= base_reg_count - contig_len; i += 2)
167          ra_class_add_reg(aligned_bary_class, i);
168    }
169 
170    ra_set_finalize(regs, NULL);
171 
172    compiler->fs_reg_sets[index].regs = regs;
173    for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++)
174       compiler->fs_reg_sets[index].classes[i] = NULL;
175    for (int i = 0; i < REG_CLASS_COUNT; i++)
176       compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
177    compiler->fs_reg_sets[index].aligned_bary_class = aligned_bary_class;
178 }
179 
180 void
181 elk_fs_alloc_reg_sets(struct elk_compiler *compiler)
182 {
183    elk_alloc_reg_set(compiler, 8);
184    elk_alloc_reg_set(compiler, 16);
185    elk_alloc_reg_set(compiler, 32);
186 }
187 
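/**
 * Return the IP of the WHILE instruction that closes the loop whose DO
 * starts \p block.
 */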
188 static int
189 count_to_loop_end(const elk_bblock_t *block)
190 {
191    if (block->end()->opcode == ELK_OPCODE_WHILE)
192       return block->end_ip;
193 
194    int depth = 1;
195    /* Skip the first block, since we don't want to count the do the calling
196     * function found.
197     */
198    for (block = block->next();
199         depth > 0;
200         block = block->next()) {
201       if (block->start()->opcode == ELK_OPCODE_DO)
202          depth++;
203       if (block->end()->opcode == ELK_OPCODE_WHILE) {
204          depth--;
205          if (depth == 0)
206             return block->end_ip;
207       }
208    }
209    unreachable("not reached");
210 }
211 
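/**
 * Compute the IP of the last use of every payload register node.  Payload
 * registers are only written at thread dispatch, so any use inside a loop
 * extends the live interval to the end of the outermost loop containing it.
 */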
212 void elk_fs_visitor::calculate_payload_ranges(unsigned payload_node_count,
213                                           int *payload_last_use_ip) const
214 {
215    int loop_depth = 0;
216    int loop_end_ip = 0;
217 
218    for (unsigned i = 0; i < payload_node_count; i++)
219       payload_last_use_ip[i] = -1;
220 
221    int ip = 0;
222    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
223       switch (inst->opcode) {
224       case ELK_OPCODE_DO:
225          loop_depth++;
226 
227          /* Since payload regs are deffed only at the start of the shader
228           * execution, any uses of the payload within a loop mean the live
229           * interval extends to the end of the outermost loop.  Find the ip of
230           * the end now.
231           */
232          if (loop_depth == 1)
233             loop_end_ip = count_to_loop_end(block);
234          break;
235       case ELK_OPCODE_WHILE:
236          loop_depth--;
237          break;
238       default:
239          break;
240       }
241 
242       int use_ip;
243       if (loop_depth > 0)
244          use_ip = loop_end_ip;
245       else
246          use_ip = ip;
247 
248       /* Note that UNIFORM args have been turned into FIXED_GRF by
249        * assign_curbe_setup(), and interpolation uses fixed hardware regs from
250        * the start (see interp_reg()).
251        */
252       for (int i = 0; i < inst->sources; i++) {
253          if (inst->src[i].file == FIXED_GRF) {
254             unsigned reg_nr = inst->src[i].nr;
255             if (reg_nr / reg_unit(devinfo) >= payload_node_count)
256                continue;
257 
258             for (unsigned j = reg_nr / reg_unit(devinfo);
259                  j < DIV_ROUND_UP(reg_nr + regs_read(inst, i),
260                                   reg_unit(devinfo));
261                  j++) {
262                payload_last_use_ip[j] = use_ip;
263                assert(j < payload_node_count);
264             }
265          }
266       }
267 
268       if (inst->dst.file == FIXED_GRF) {
269          unsigned reg_nr = inst->dst.nr;
270          if (reg_nr / reg_unit(devinfo) < payload_node_count) {
271             for (unsigned j = reg_nr / reg_unit(devinfo);
272                  j < DIV_ROUND_UP(reg_nr + regs_written(inst),
273                                   reg_unit(devinfo));
274                  j++) {
275                payload_last_use_ip[j] = use_ip;
276                assert(j < payload_node_count);
277             }
278          }
279       }
280 
281       /* Special case instructions which have extra implied registers used. */
282       switch (inst->opcode) {
283       case ELK_CS_OPCODE_CS_TERMINATE:
284          payload_last_use_ip[0] = use_ip;
285          break;
286 
287       default:
288          if (inst->eot) {
289             /* We could omit this for the !inst->header_present case, except
290              * that the simulator apparently incorrectly reads from g0/g1
291              * instead of sideband.  It also really freaks out driver
292              * developers to see g0 used in unusual places, so just always
293              * reserve it.
294              */
295             payload_last_use_ip[0] = use_ip;
296             payload_last_use_ip[1] = use_ip;
297          }
298          break;
299       }
300 
301       ip++;
302    }
303 }
304 
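/* Graph-coloring register allocator state for the scalar backend: it builds
 * an interference graph over payload nodes, virtual GRFs and the various
 * hardware-workaround nodes, and spills registers to scratch space when
 * allocation fails.
 */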
305 class elk_fs_reg_alloc {
306 public:
307    elk_fs_reg_alloc(elk_fs_visitor *fs):
308       fs(fs), devinfo(fs->devinfo), compiler(fs->compiler),
309       live(fs->live_analysis.require()), g(NULL),
310       have_spill_costs(false)
311    {
312       mem_ctx = ralloc_context(NULL);
313 
314       /* Stash the number of instructions so we can sanity check that our
315        * counts still match liveness.
316        */
317       live_instr_count = fs->cfg->last_block()->end_ip + 1;
318 
319       spill_insts = _mesa_pointer_set_create(mem_ctx);
320 
321       /* Most of this allocation was written for a reg_width of 1
322        * (dispatch_width == 8).  In extending to SIMD16, the code was
323        * left in place and it was converted to have the hardware
324        * registers it's allocating be contiguous physical pairs of regs
325        * for reg_width == 2.
326        */
327       int reg_width = fs->dispatch_width / 8;
328       rsi = util_logbase2(reg_width);
329       payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width);
330 
331       /* Get payload IP information */
332       payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count);
333 
334       node_count = 0;
335       first_payload_node = 0;
336       first_mrf_hack_node = 0;
337       scratch_header_node = 0;
338       grf127_send_hack_node = 0;
339       first_vgrf_node = 0;
340       last_vgrf_node = 0;
341       first_spill_node = 0;
342 
343       spill_vgrf_ip = NULL;
344       spill_vgrf_ip_alloc = 0;
345       spill_node_count = 0;
346    }
347 
348    ~elk_fs_reg_alloc()
349    {
350       ralloc_free(mem_ctx);
351    }
352 
353    bool assign_regs(bool allow_spilling, bool spill_all);
354 
355 private:
356    void setup_live_interference(unsigned node,
357                                 int node_start_ip, int node_end_ip);
358    void setup_inst_interference(const elk_fs_inst *inst);
359 
360    void build_interference_graph(bool allow_spilling);
361    void discard_interference_graph();
362 
363    elk_fs_reg build_lane_offsets(const fs_builder &bld,
364                              uint32_t spill_offset, int ip);
365    elk_fs_reg build_single_offset(const fs_builder &bld,
366                               uint32_t spill_offset, int ip);
367 
368    void emit_unspill(const fs_builder &bld, struct shader_stats *stats,
369                      elk_fs_reg dst, uint32_t spill_offset, unsigned count, int ip);
370    void emit_spill(const fs_builder &bld, struct shader_stats *stats,
371                    elk_fs_reg src, uint32_t spill_offset, unsigned count, int ip);
372 
373    void set_spill_costs();
374    int choose_spill_reg();
375    elk_fs_reg alloc_scratch_header();
376    elk_fs_reg alloc_spill_reg(unsigned size, int ip);
377    void spill_reg(unsigned spill_reg);
378 
379    void *mem_ctx;
380    elk_fs_visitor *fs;
381    const intel_device_info *devinfo;
382    const elk_compiler *compiler;
383    const fs_live_variables &live;
384    int live_instr_count;
385 
386    set *spill_insts;
387 
388    /* Which compiler->fs_reg_sets[] to use */
389    int rsi;
390 
391    ra_graph *g;
392    bool have_spill_costs;
393 
394    int payload_node_count;
395    int *payload_last_use_ip;
396 
397    int node_count;
398    int first_payload_node;
399    int first_mrf_hack_node;
400    int scratch_header_node;
401    int grf127_send_hack_node;
402    int first_vgrf_node;
403    int last_vgrf_node;
404    int first_spill_node;
405 
406    int *spill_vgrf_ip;
407    int spill_vgrf_ip_alloc;
408    int spill_node_count;
409 
410    elk_fs_reg scratch_header;
411 };
412 
413 /**
414  * Sets the mrf_used array to indicate which MRFs are used by the shader IR
415  *
416  * This is used in assign_regs() to decide which of the GRFs that we use as
417  * MRFs on gfx7 get normally register allocated, and in register spilling to
418  * see if we can actually use MRFs to do spills without overwriting normal MRF
419  * contents.
420  */
421 static void
422 get_used_mrfs(const elk_fs_visitor *v, bool *mrf_used)
423 {
424    int reg_width = v->dispatch_width / 8;
425 
426    memset(mrf_used, 0, ELK_MAX_MRF(v->devinfo->ver) * sizeof(bool));
427 
428    foreach_block_and_inst(block, elk_fs_inst, inst, v->cfg) {
429       if (inst->dst.file == MRF) {
430          int reg = inst->dst.nr & ~ELK_MRF_COMPR4;
431          mrf_used[reg] = true;
432          if (reg_width == 2) {
433             if (inst->dst.nr & ELK_MRF_COMPR4) {
434                mrf_used[reg + 4] = true;
435             } else {
436                mrf_used[reg + 1] = true;
437             }
438          }
439       }
440 
441       if (inst->mlen > 0) {
442 	 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
443             mrf_used[inst->base_mrf + i] = true;
444          }
445       }
446    }
447 }
448 
449 namespace {
450    /**
451     * Maximum spill block size we expect to encounter in 32B units.
452     *
453     * This is somewhat arbitrary and doesn't necessarily limit the maximum
454     * variable size that can be spilled -- a higher value will allow a
455     * variable of a given size to be spilled more efficiently with a smaller
456     * number of scratch messages, but will increase the likelihood of a
457     * collision between the MRFs reserved for spilling and other MRFs used by
458     * the program (and possibly increase GRF register pressure on platforms
459     * without hardware MRFs), which could cause register allocation to fail.
460     *
461     * For the moment reserve just enough space so a register of 32 bit
462     * component type and natural region width can be spilled without splitting
463     * into multiple (force_writemask_all) scratch messages.
464     */
465    unsigned
466    spill_max_size(const elk_backend_shader *s)
467    {
468       /* LSC is limited to SIMD16 sends */
469       if (s->devinfo->has_lsc)
470          return 2;
471 
472       /* FINISHME - On Gfx7+ it should be possible to avoid this limit
473        *            altogether by spilling directly from the temporary GRF
474        *            allocated to hold the result of the instruction (and the
475        *            scratch write header).
476        */
477       /* FINISHME - The shader's dispatch width probably belongs in
478        *            elk_backend_shader (or some nonexistent fs_shader class?)
479        *            rather than in the visitor class.
480        */
481       return static_cast<const elk_fs_visitor *>(s)->dispatch_width / 8;
482    }
483 
484    /**
485     * First MRF register available for spilling.
486     */
487    unsigned
488    spill_base_mrf(const elk_backend_shader *s)
489    {
490       /* We don't use the MRF hack on Gfx9+ */
491       assert(s->devinfo->ver < 9);
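      /* Illustrative example: in SIMD16 on Gfx7 (16 MRFs, 2 reserved for the
       * spill payload) this evaluates to 16 - 2 - 1 == 13, matching the
       * "MRFs 13-15" note in spill_reg() below.
       */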
492       return ELK_MAX_MRF(s->devinfo->ver) - spill_max_size(s) - 1;
493    }
494 }
495 
496 void
497 elk_fs_reg_alloc::setup_live_interference(unsigned node,
498                                       int node_start_ip, int node_end_ip)
499 {
500    /* Mark any virtual grf that is live between the start of the program and
501     * the last use of a payload node interfering with that payload node.
502     */
503    for (int i = 0; i < payload_node_count; i++) {
504       if (payload_last_use_ip[i] == -1)
505          continue;
506 
507       /* Note that we use a <= comparison, unlike vgrfs_interfere(),
508        * in order to not have to worry about the uniform issue described in
509        * calculate_live_intervals().
510        */
511       if (node_start_ip <= payload_last_use_ip[i])
512          ra_add_node_interference(g, node, first_payload_node + i);
513    }
514 
515    /* If we have the MRF hack enabled, mark this node as interfering with all
516     * MRF registers.
517     */
518    if (first_mrf_hack_node >= 0) {
519       for (int i = spill_base_mrf(fs); i < ELK_MAX_MRF(devinfo->ver); i++)
520          ra_add_node_interference(g, node, first_mrf_hack_node + i);
521    }
522 
523    /* Everything interferes with the scratch header */
524    if (scratch_header_node >= 0)
525       ra_add_node_interference(g, node, scratch_header_node);
526 
527    /* Add interference with every vgrf whose live range intersects this
528     * node's.  We only need to look at nodes below this one as the reflexivity
529     * of interference will take care of the rest.
530     */
531    for (unsigned n2 = first_vgrf_node;
532         n2 <= (unsigned)last_vgrf_node && n2 < node; n2++) {
533       unsigned vgrf = n2 - first_vgrf_node;
534       if (!(node_end_ip <= live.vgrf_start[vgrf] ||
535             live.vgrf_end[vgrf] <= node_start_ip))
536          ra_add_node_interference(g, node, n2);
537    }
538 }
539 
540 void
541 elk_fs_reg_alloc::setup_inst_interference(const elk_fs_inst *inst)
542 {
543    /* Certain instructions can't safely use the same register for their
544     * sources and destination.  Add interference.
545     */
546    if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
547       for (unsigned i = 0; i < inst->sources; i++) {
548          if (inst->src[i].file == VGRF) {
549             ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
550                                         first_vgrf_node + inst->src[i].nr);
551          }
552       }
553    }
554 
555    /* A compressed instruction is actually two instructions executed
556     * simultaneously.  On most platforms, it is ok to have the source and
557     * destination registers be the same.  In this case, each instruction
558     * over-writes its own source and there's no problem.  The real problem
559     * here is if the source and destination registers are off by one.  Then
560     * you can end up in a scenario where the first instruction over-writes the
561     * source of the second instruction.  Since the compiler doesn't know about
562     * this level of granularity, we simply make the source and destination
563     * interfere.
564     */
565    if (inst->dst.component_size(inst->exec_size) > REG_SIZE &&
566        inst->dst.file == VGRF) {
567       for (int i = 0; i < inst->sources; ++i) {
568          if (inst->src[i].file == VGRF) {
569             ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
570                                         first_vgrf_node + inst->src[i].nr);
571          }
572       }
573    }
574 
575    if (grf127_send_hack_node >= 0) {
576       /* From the Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
577        * subsection "EUISA Instructions", Send Message (page 990):
578        *
579        * "r127 must not be used for return address when there is a src and
580        * dest overlap in send instruction."
581        *
582        * We avoid using grf127 as part of the destination of send
583        * messages by adding a node interference to the grf127_send_hack_node.
584        * This node has a fixed assignment to grf127.
585        *
586        * We don't apply it to SIMD16 instructions because previous code avoids
587        * any register overlap between sources and destination.
588        */
589       if (inst->exec_size < 16 && inst->is_send_from_grf() &&
590           inst->dst.file == VGRF)
591          ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
592                                      grf127_send_hack_node);
593 
594       /* Spilling instructions are generated as SEND messages from MRFs, but
595        * since Gfx7+ supports sending from GRFs, the driver assigns these
596        * MRF registers to GRFs. The implementation reuses the dest of the send
597        * message as a source, so the overlap is guaranteed and we create
598        * an interference between the destination and grf127.
599        */
600       if ((inst->opcode == ELK_SHADER_OPCODE_GFX7_SCRATCH_READ ||
601            inst->opcode == ELK_SHADER_OPCODE_GFX4_SCRATCH_READ) &&
602           inst->dst.file == VGRF)
603          ra_add_node_interference(g, first_vgrf_node + inst->dst.nr,
604                                      grf127_send_hack_node);
605    }
606 
607    /* From the Skylake PRM Vol. 2a docs for sends:
608     *
609     *    "It is required that the second block of GRFs does not overlap with
610     *    the first block."
611     *
612     * Normally, this is taken care of by fixup_sends_duplicate_payload() but
613     * in the case where one of the registers is an undefined value, the
614     * register allocator may decide that they don't interfere even though
615     * they're used as sources in the same instruction.  We also need to add
616     * interference here.
617     */
618    if (devinfo->ver >= 9) {
619       if (inst->opcode == ELK_SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
620           inst->src[2].file == VGRF && inst->src[3].file == VGRF &&
621           inst->src[2].nr != inst->src[3].nr)
622          ra_add_node_interference(g, first_vgrf_node + inst->src[2].nr,
623                                      first_vgrf_node + inst->src[3].nr);
624    }
625 
626    /* When we do send-from-GRF for FB writes, we need to ensure that the last
627     * write instruction sends from a high register.  This is because the
628     * vertex fetcher wants to start filling the low payload registers while
629     * the pixel data port is still working on writing out the memory.  If we
630     * don't do this, we get rendering artifacts.
631     *
632     * We could just do "something high".  Instead, we just pick the highest
633     * register that works.
634     */
635    if (inst->eot) {
636       const int vgrf = inst->opcode == ELK_SHADER_OPCODE_SEND ?
637                        inst->src[2].nr : inst->src[0].nr;
638       const int size = DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo));
639       int reg = ELK_MAX_GRF - size;
640 
641       if (first_mrf_hack_node >= 0) {
642          /* If something happened to spill, we want to push the EOT send
643           * register early enough in the register file that we don't
644           * conflict with any used MRF hack registers.
645           */
646          reg -= ELK_MAX_MRF(devinfo->ver) - spill_base_mrf(fs);
647       } else if (grf127_send_hack_node >= 0) {
648          /* Avoid r127 which might be unusable if the node was previously
649           * written by a SIMD8 SEND message with source/destination overlap.
650           */
651          reg--;
652       }
653 
654       ra_set_node_reg(g, first_vgrf_node + vgrf, reg);
655 
656       if (inst->ex_mlen > 0) {
657          const int vgrf = inst->src[3].nr;
658          reg -= DIV_ROUND_UP(fs->alloc.sizes[vgrf], reg_unit(devinfo));
659          ra_set_node_reg(g, first_vgrf_node + vgrf, reg);
660       }
661    }
662 }
663 
664 void
665 elk_fs_reg_alloc::build_interference_graph(bool allow_spilling)
666 {
667    /* Compute the RA node layout */
668    node_count = 0;
669    first_payload_node = node_count;
670    node_count += payload_node_count;
671    if (devinfo->ver >= 7 && devinfo->ver < 9 && allow_spilling) {
672       first_mrf_hack_node = node_count;
673       node_count += ELK_MAX_GRF - GFX7_MRF_HACK_START;
674    } else {
675       first_mrf_hack_node = -1;
676    }
677    if (devinfo->ver >= 8) {
678       grf127_send_hack_node = node_count;
679       node_count ++;
680    } else {
681       grf127_send_hack_node = -1;
682    }
683    first_vgrf_node = node_count;
684    node_count += fs->alloc.count;
685    last_vgrf_node = node_count - 1;
686    if ((devinfo->ver >= 9 && devinfo->verx10 < 125) && allow_spilling) {
687       scratch_header_node = node_count++;
688    } else {
689       scratch_header_node = -1;
690    }
691    first_spill_node = node_count;
692 
693    fs->calculate_payload_ranges(payload_node_count,
694                                 payload_last_use_ip);
695 
696    assert(g == NULL);
697    g = ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
698    ralloc_steal(mem_ctx, g);
699 
700    /* Set up the payload nodes */
701    for (int i = 0; i < payload_node_count; i++)
702       ra_set_node_reg(g, first_payload_node + i, i);
703 
704    if (first_mrf_hack_node >= 0) {
705       /* Mark each MRF reg node as being allocated to its physical
706        * register.
707        *
708        * The alternative would be to have per-physical-register classes,
709        * which would just be silly.
710        */
711       for (int i = 0; i < ELK_MAX_MRF(devinfo->ver); i++) {
712          ra_set_node_reg(g, first_mrf_hack_node + i,
713                             GFX7_MRF_HACK_START + i);
714       }
715    }
716 
717    if (grf127_send_hack_node >= 0)
718       ra_set_node_reg(g, grf127_send_hack_node, 127);
719 
720    /* Specify the classes of each virtual register. */
721    for (unsigned i = 0; i < fs->alloc.count; i++) {
722       unsigned size = DIV_ROUND_UP(fs->alloc.sizes[i], reg_unit(devinfo));
723 
724       assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) &&
725              "Register allocation relies on split_virtual_grfs()");
726 
727       ra_set_node_class(g, first_vgrf_node + i,
728                         compiler->fs_reg_sets[rsi].classes[size - 1]);
729    }
730 
731    /* Special case: on pre-Gfx7 hardware that supports PLN, the second operand
732     * of a PLN instruction needs to be an even-numbered register, so we have a
733     * special register class aligned_bary_class to handle this case.
734     */
735    if (compiler->fs_reg_sets[rsi].aligned_bary_class) {
736       foreach_block_and_inst(block, elk_fs_inst, inst, fs->cfg) {
737          if (inst->opcode == ELK_FS_OPCODE_LINTERP && inst->src[0].file == VGRF &&
738              fs->alloc.sizes[inst->src[0].nr] ==
739                aligned_bary_size(fs->dispatch_width)) {
740             ra_set_node_class(g, first_vgrf_node + inst->src[0].nr,
741                               compiler->fs_reg_sets[rsi].aligned_bary_class);
742          }
743       }
744    }
745 
746    /* Add interference based on the live range of the register */
747    for (unsigned i = 0; i < fs->alloc.count; i++) {
748       setup_live_interference(first_vgrf_node + i,
749                               live.vgrf_start[i],
750                               live.vgrf_end[i]);
751    }
752 
753    /* Add interference based on the instructions in which a register is used.
754     */
755    foreach_block_and_inst(block, elk_fs_inst, inst, fs->cfg)
756       setup_inst_interference(inst);
757 }
758 
759 void
760 elk_fs_reg_alloc::discard_interference_graph()
761 {
762    ralloc_free(g);
763    g = NULL;
764    have_spill_costs = false;
765 }
766 
767 elk_fs_reg
768 elk_fs_reg_alloc::build_single_offset(const fs_builder &bld, uint32_t spill_offset, int ip)
769 {
770    elk_fs_reg offset = retype(alloc_spill_reg(1, ip), ELK_REGISTER_TYPE_UD);
771    elk_fs_inst *inst = bld.MOV(offset, elk_imm_ud(spill_offset));
772    _mesa_set_add(spill_insts, inst);
773    return offset;
774 }
775 
776 elk_fs_reg
777 elk_fs_reg_alloc::build_lane_offsets(const fs_builder &bld, uint32_t spill_offset, int ip)
778 {
779    /* LSC messages are limited to SIMD16 */
780    assert(bld.dispatch_width() <= 16);
781 
782    const fs_builder ubld = bld.exec_all();
783    const unsigned reg_count = ubld.dispatch_width() / 8;
784 
785    elk_fs_reg offset = retype(alloc_spill_reg(reg_count, ip), ELK_REGISTER_TYPE_UD);
786    elk_fs_inst *inst;
787 
788    /* Build an offset per lane in SIMD8 */
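   /* 0x76543210 is a packed UV immediate: lane i (0..7) receives the value i. */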
789    inst = ubld.group(8, 0).MOV(retype(offset, ELK_REGISTER_TYPE_UW),
790                                elk_imm_uv(0x76543210));
791    _mesa_set_add(spill_insts, inst);
792    inst = ubld.group(8, 0).MOV(offset, retype(offset, ELK_REGISTER_TYPE_UW));
793    _mesa_set_add(spill_insts, inst);
794 
795    /* Build offsets in the upper 8 lanes of SIMD16 */
796    if (ubld.dispatch_width() > 8) {
797       inst = ubld.group(8, 0).ADD(
798          byte_offset(offset, REG_SIZE),
799          byte_offset(offset, 0),
800          elk_imm_ud(8));
801       _mesa_set_add(spill_insts, inst);
802    }
803 
804    /* Make the offset a dword */
805    inst = ubld.SHL(offset, offset, elk_imm_ud(2));
806    _mesa_set_add(spill_insts, inst);
807 
808    /* Add the base offset */
809    inst = ubld.ADD(offset, offset, elk_imm_ud(spill_offset));
810    _mesa_set_add(spill_insts, inst);
811 
812    return offset;
813 }
814 
815 void
816 elk_fs_reg_alloc::emit_unspill(const fs_builder &bld,
817                            struct shader_stats *stats,
818                            elk_fs_reg dst,
819                            uint32_t spill_offset, unsigned count, int ip)
820 {
821    const intel_device_info *devinfo = bld.shader->devinfo;
822    const unsigned reg_size = dst.component_size(bld.dispatch_width()) /
823                              REG_SIZE;
824    assert(count % reg_size == 0);
825 
826    for (unsigned i = 0; i < count / reg_size; i++) {
827       ++stats->fill_count;
828 
829       elk_fs_inst *unspill_inst;
830       if (devinfo->verx10 >= 125) {
831          /* LSC is limited to SIMD16 load/store but we can load more using
832           * transpose messages.
833           */
834          const bool use_transpose = bld.dispatch_width() > 16;
835          const fs_builder ubld = use_transpose ? bld.exec_all().group(1, 0) : bld;
836          elk_fs_reg offset;
837          if (use_transpose) {
838             offset = build_single_offset(ubld, spill_offset, ip);
839          } else {
840             offset = build_lane_offsets(ubld, spill_offset, ip);
841          }
842          /* We leave the extended descriptor empty and flag the instruction to
843           * ask the generator to insert the extended descriptor in the address
844           * register. That way we don't need to burn an additional register
845           * for register allocation spill/fill.
846           */
847          elk_fs_reg srcs[] = {
848             elk_imm_ud(0), /* desc */
849             elk_imm_ud(0), /* ex_desc */
850             offset,        /* payload */
851             elk_fs_reg(),      /* payload2 */
852          };
853 
854          unspill_inst = ubld.emit(ELK_SHADER_OPCODE_SEND, dst,
855                                   srcs, ARRAY_SIZE(srcs));
856          unspill_inst->sfid = GFX12_SFID_UGM;
857          unspill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
858                                            unspill_inst->exec_size,
859                                            LSC_ADDR_SURFTYPE_SS,
860                                            LSC_ADDR_SIZE_A32,
861                                            1 /* num_coordinates */,
862                                            LSC_DATA_SIZE_D32,
863                                            use_transpose ? reg_size * 8 : 1 /* num_channels */,
864                                            use_transpose,
865                                            LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
866                                            true /* has_dest */);
867          unspill_inst->header_size = 0;
868          unspill_inst->mlen =
869             lsc_msg_desc_src0_len(devinfo, unspill_inst->desc);
870          unspill_inst->ex_mlen = 0;
871          unspill_inst->size_written =
872             lsc_msg_desc_dest_len(devinfo, unspill_inst->desc) * REG_SIZE;
873          unspill_inst->send_has_side_effects = false;
874          unspill_inst->send_is_volatile = true;
875          unspill_inst->send_ex_desc_scratch = true;
876       } else if (devinfo->ver >= 9) {
877          elk_fs_reg header = this->scratch_header;
878          fs_builder ubld = bld.exec_all().group(1, 0);
879          assert(spill_offset % 16 == 0);
880          unspill_inst = ubld.MOV(component(header, 2),
881                                  elk_imm_ud(spill_offset / 16));
882          _mesa_set_add(spill_insts, unspill_inst);
883 
884          const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
885          const elk_fs_reg ex_desc = elk_imm_ud(0);
886 
887          elk_fs_reg srcs[] = { elk_imm_ud(0), ex_desc, header };
888          unspill_inst = bld.emit(ELK_SHADER_OPCODE_SEND, dst,
889                                  srcs, ARRAY_SIZE(srcs));
890          unspill_inst->mlen = 1;
891          unspill_inst->header_size = 1;
892          unspill_inst->size_written = reg_size * REG_SIZE;
893          unspill_inst->send_has_side_effects = false;
894          unspill_inst->send_is_volatile = true;
895          unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
896          unspill_inst->desc =
897             elk_dp_desc(devinfo, bti,
898                         ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
899                         ELK_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
900       } else if (devinfo->ver >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
901          /* The Gfx7 descriptor-based offset is 12 bits of HWORD units.
902           * Because the Gfx7-style scratch block read is hardwired to BTI 255,
903           * on Gfx9+ it would cause the DC to do an IA-coherent read, which
904           * largely outweighs the slight advantage from not having to provide
905           * the address as part of the message header, so we're better off
906           * using plain old oword block reads.
907           */
908          unspill_inst = bld.emit(ELK_SHADER_OPCODE_GFX7_SCRATCH_READ, dst);
909          unspill_inst->offset = spill_offset;
910       } else {
911          unspill_inst = bld.emit(ELK_SHADER_OPCODE_GFX4_SCRATCH_READ, dst);
912          unspill_inst->offset = spill_offset;
913          unspill_inst->base_mrf = spill_base_mrf(bld.shader);
914          unspill_inst->mlen = 1; /* header contains offset */
915       }
916       _mesa_set_add(spill_insts, unspill_inst);
917 
918       dst.offset += reg_size * REG_SIZE;
919       spill_offset += reg_size * REG_SIZE;
920    }
921 }
922 
923 void
924 elk_fs_reg_alloc::emit_spill(const fs_builder &bld,
925                          struct shader_stats *stats,
926                          elk_fs_reg src,
927                          uint32_t spill_offset, unsigned count, int ip)
928 {
929    const intel_device_info *devinfo = bld.shader->devinfo;
930    const unsigned reg_size = src.component_size(bld.dispatch_width()) /
931                              REG_SIZE;
932    assert(count % reg_size == 0);
933 
934    for (unsigned i = 0; i < count / reg_size; i++) {
935       ++stats->spill_count;
936 
937       elk_fs_inst *spill_inst;
938       if (devinfo->verx10 >= 125) {
939          elk_fs_reg offset = build_lane_offsets(bld, spill_offset, ip);
940          /* We leave the extended descriptor empty and flag the instruction
941           * to relocate the extended descriptor. That way the surface offset is
942           * directly put into the instruction and we don't need to use a
943           * register to hold it.
944           */
945          elk_fs_reg srcs[] = {
946             elk_imm_ud(0),        /* desc */
947             elk_imm_ud(0),        /* ex_desc */
948             offset,               /* payload */
949             src,                  /* payload2 */
950          };
951          spill_inst = bld.emit(ELK_SHADER_OPCODE_SEND, bld.null_reg_f(),
952                                srcs, ARRAY_SIZE(srcs));
953          spill_inst->sfid = GFX12_SFID_UGM;
954          spill_inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE,
955                                          bld.dispatch_width(),
956                                          LSC_ADDR_SURFTYPE_SS,
957                                          LSC_ADDR_SIZE_A32,
958                                          1 /* num_coordinates */,
959                                          LSC_DATA_SIZE_D32,
960                                          1 /* num_channels */,
961                                          false /* transpose */,
962                                          LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
963                                          false /* has_dest */);
964          spill_inst->header_size = 0;
965          spill_inst->mlen = lsc_msg_desc_src0_len(devinfo, spill_inst->desc);
966          spill_inst->ex_mlen = reg_size;
967          spill_inst->size_written = 0;
968          spill_inst->send_has_side_effects = true;
969          spill_inst->send_is_volatile = false;
970          spill_inst->send_ex_desc_scratch = true;
971       } else if (devinfo->ver >= 9) {
972          elk_fs_reg header = this->scratch_header;
973          fs_builder ubld = bld.exec_all().group(1, 0);
974          assert(spill_offset % 16 == 0);
975          spill_inst = ubld.MOV(component(header, 2),
976                                elk_imm_ud(spill_offset / 16));
977          _mesa_set_add(spill_insts, spill_inst);
978 
979          const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
980          const elk_fs_reg ex_desc = elk_imm_ud(0);
981 
982          elk_fs_reg srcs[] = { elk_imm_ud(0), ex_desc, header, src };
983          spill_inst = bld.emit(ELK_SHADER_OPCODE_SEND, bld.null_reg_f(),
984                                srcs, ARRAY_SIZE(srcs));
985          spill_inst->mlen = 1;
986          spill_inst->ex_mlen = reg_size;
987          spill_inst->size_written = 0;
988          spill_inst->header_size = 1;
989          spill_inst->send_has_side_effects = true;
990          spill_inst->send_is_volatile = false;
991          spill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
992          spill_inst->desc =
993             elk_dp_desc(devinfo, bti,
994                         GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
995                         ELK_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
996       } else {
997          spill_inst = bld.emit(ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE,
998                                bld.null_reg_f(), src);
999          spill_inst->offset = spill_offset;
1000          spill_inst->mlen = 1 + reg_size; /* header, value */
1001          spill_inst->base_mrf = spill_base_mrf(bld.shader);
1002       }
1003       _mesa_set_add(spill_insts, spill_inst);
1004 
1005       src.offset += reg_size * REG_SIZE;
1006       spill_offset += reg_size * REG_SIZE;
1007    }
1008 }
1009 
1010 void
1011 elk_fs_reg_alloc::set_spill_costs()
1012 {
1013    float block_scale = 1.0;
1014    float spill_costs[fs->alloc.count];
1015    bool no_spill[fs->alloc.count];
1016 
1017    for (unsigned i = 0; i < fs->alloc.count; i++) {
1018       spill_costs[i] = 0.0;
1019       no_spill[i] = false;
1020    }
1021 
1022    /* Calculate costs for spilling nodes.  Call it a cost of 1 per
1023     * spill/unspill we'll have to do, and guess that the insides of
1024     * loops run 10 times.
1025     */
1026    foreach_block_and_inst(block, elk_fs_inst, inst, fs->cfg) {
1027       for (unsigned int i = 0; i < inst->sources; i++) {
1028 	 if (inst->src[i].file == VGRF)
1029             spill_costs[inst->src[i].nr] += regs_read(inst, i) * block_scale;
1030       }
1031 
1032       if (inst->dst.file == VGRF)
1033          spill_costs[inst->dst.nr] += regs_written(inst) * block_scale;
1034 
1035       /* Don't spill anything we generated while spilling */
1036       if (_mesa_set_search(spill_insts, inst)) {
1037          for (unsigned int i = 0; i < inst->sources; i++) {
1038 	    if (inst->src[i].file == VGRF)
1039                no_spill[inst->src[i].nr] = true;
1040          }
1041 	 if (inst->dst.file == VGRF)
1042             no_spill[inst->dst.nr] = true;
1043       }
1044 
1045       switch (inst->opcode) {
1046 
1047       case ELK_OPCODE_DO:
1048 	 block_scale *= 10;
1049 	 break;
1050 
1051       case ELK_OPCODE_WHILE:
1052 	 block_scale /= 10;
1053 	 break;
1054 
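      /* Weight code inside IF/ENDIF blocks at half cost, since conditionally
       * executed code may not run for every invocation.
       */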
1055       case ELK_OPCODE_IF:
1056       case ELK_OPCODE_IFF:
1057          block_scale *= 0.5;
1058          break;
1059 
1060       case ELK_OPCODE_ENDIF:
1061          block_scale /= 0.5;
1062          break;
1063 
1064       default:
1065 	 break;
1066       }
1067    }
1068 
1069    for (unsigned i = 0; i < fs->alloc.count; i++) {
1070       /* Do the no_spill check first.  Registers that are used as spill
1071        * temporaries may have been allocated after we calculated liveness so
1072        * we shouldn't look their liveness up.  Fortunately, they're always
1073        * used in SCRATCH_READ/WRITE instructions so they'll always be flagged
1074        * no_spill.
1075        */
1076       if (no_spill[i])
1077          continue;
1078 
1079       int live_length = live.vgrf_end[i] - live.vgrf_start[i];
1080       if (live_length <= 0)
1081          continue;
1082 
1083       /* Divide the cost (in number of spills/fills) by the log of the length
1084        * of the live range of the register.  This will encourage spill logic
1085        * to spill long-living things before spilling short-lived things where
1086        * spilling is less likely to actually do us any good.  We use the log
1087        * of the length because it will fall off very quickly and not cause us
1088        * to spill medium length registers with more uses.
1089        */
1090       float adjusted_cost = spill_costs[i] / logf(live_length);
1091       ra_set_node_spill_cost(g, first_vgrf_node + i, adjusted_cost);
1092    }
1093 
1094    have_spill_costs = true;
1095 }
1096 
1097 int
1098 elk_fs_reg_alloc::choose_spill_reg()
1099 {
1100    if (!have_spill_costs)
1101       set_spill_costs();
1102 
1103    int node = ra_get_best_spill_node(g);
1104    if (node < 0)
1105       return -1;
1106 
1107    assert(node >= first_vgrf_node);
1108    return node - first_vgrf_node;
1109 }
1110 
1111 elk_fs_reg
1112 elk_fs_reg_alloc::alloc_scratch_header()
1113 {
1114    int vgrf = fs->alloc.allocate(1);
1115    assert(first_vgrf_node + vgrf == scratch_header_node);
1116    ra_set_node_class(g, scratch_header_node,
1117                         compiler->fs_reg_sets[rsi].classes[0]);
1118 
1119    setup_live_interference(scratch_header_node, 0, INT_MAX);
1120 
1121    return elk_fs_reg(VGRF, vgrf, ELK_REGISTER_TYPE_UD);
1122 }
1123 
1124 elk_fs_reg
1125 elk_fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
1126 {
1127    int vgrf = fs->alloc.allocate(ALIGN(size, reg_unit(devinfo)));
1128    int class_idx = DIV_ROUND_UP(size, reg_unit(devinfo)) - 1;
1129    int n = ra_add_node(g, compiler->fs_reg_sets[rsi].classes[class_idx]);
1130    assert(n == first_vgrf_node + vgrf);
1131    assert(n == first_spill_node + spill_node_count);
1132 
1133    setup_live_interference(n, ip - 1, ip + 1);
1134 
1135    /* Add interference between this spill node and any other spill nodes for
1136     * the same instruction.
1137     */
1138    for (int s = 0; s < spill_node_count; s++) {
1139       if (spill_vgrf_ip[s] == ip)
1140          ra_add_node_interference(g, n, first_spill_node + s);
1141    }
1142 
1143    /* Add this spill node to the list for next time */
1144    if (spill_node_count >= spill_vgrf_ip_alloc) {
1145       if (spill_vgrf_ip_alloc == 0)
1146          spill_vgrf_ip_alloc = 16;
1147       else
1148          spill_vgrf_ip_alloc *= 2;
1149       spill_vgrf_ip = reralloc(mem_ctx, spill_vgrf_ip, int,
1150                                spill_vgrf_ip_alloc);
1151    }
1152    spill_vgrf_ip[spill_node_count++] = ip;
1153 
1154    return elk_fs_reg(VGRF, vgrf);
1155 }
1156 
1157 void
1158 elk_fs_reg_alloc::spill_reg(unsigned spill_reg)
1159 {
1160    int size = fs->alloc.sizes[spill_reg];
1161    unsigned int spill_offset = fs->last_scratch;
1162    assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
1163 
1164    /* Spills may use MRFs 13-15 in the SIMD16 case.  Our texturing is done
1165     * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
1166     * up to m13 (gfx6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
1167     * m15 (gfx4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
1168     * depth), starting from m1.  In summary: We may not be able to spill in
1169     * SIMD16 mode, because we'd stomp the FB writes.
1170     */
1171    if (!fs->spilled_any_registers) {
1172       if (devinfo->verx10 >= 125) {
1173          /* We will allocate a register on the fly */
1174       } else if (devinfo->ver >= 9) {
1175          this->scratch_header = alloc_scratch_header();
1176          fs_builder ubld = fs_builder(fs, 8).exec_all().at(
1177             fs->cfg->first_block(), fs->cfg->first_block()->start());
1178 
1179          elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER,
1180                                    this->scratch_header);
1181          _mesa_set_add(spill_insts, inst);
1182       } else {
1183          bool mrf_used[ELK_MAX_MRF(devinfo->ver)];
1184          get_used_mrfs(fs, mrf_used);
1185 
1186          for (int i = spill_base_mrf(fs); i < ELK_MAX_MRF(devinfo->ver); i++) {
1187             if (mrf_used[i]) {
1188                fs->fail("Register spilling not supported with m%d used", i);
1189                return;
1190             }
1191          }
1192       }
1193 
1194       fs->spilled_any_registers = true;
1195    }
1196 
1197    fs->last_scratch += size * REG_SIZE;
1198 
1199    /* We're about to replace all uses of this register.  It no longer
1200     * conflicts with anything so we can get rid of its interference.
1201     */
1202    ra_set_node_spill_cost(g, first_vgrf_node + spill_reg, 0);
1203    ra_reset_node_interference(g, first_vgrf_node + spill_reg);
1204 
1205    /* Generate spill/unspill instructions for the objects being
1206     * spilled.  Right now, we spill or unspill the whole thing to a
1207     * virtual grf of the same size.  For most instructions, though, we
1208     * could just spill/unspill the GRF being accessed.
1209     */
1210    int ip = 0;
1211    foreach_block_and_inst (block, elk_fs_inst, inst, fs->cfg) {
1212       const fs_builder ibld = fs_builder(fs, block, inst);
1213       exec_node *before = inst->prev;
1214       exec_node *after = inst->next;
1215 
1216       for (unsigned int i = 0; i < inst->sources; i++) {
1217 	 if (inst->src[i].file == VGRF &&
1218              inst->src[i].nr == spill_reg) {
1219             int count = regs_read(inst, i);
1220             int subset_spill_offset = spill_offset +
1221                ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
1222             elk_fs_reg unspill_dst = alloc_spill_reg(count, ip);
1223 
1224             inst->src[i].nr = unspill_dst.nr;
1225             inst->src[i].offset %= REG_SIZE;
1226 
1227             /* We read the largest power-of-two divisor of the register count
1228              * (because only POT scratch read blocks are allowed by the
1229              * hardware) up to the maximum supported block size.
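             * For example, count == 3 gives 1u << (ffs(24) - 1) == 8, i.e. a
             * SIMD8 read.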
1230              */
1231             const unsigned width =
1232                MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));
1233 
1234             /* Set exec_all() on unspill messages under the (rather
1235              * pessimistic) assumption that there is no one-to-one
1236              * correspondence between channels of the spilled variable in
1237              * scratch space and the scratch read message, which operates on
1238              * 32 bit channels.  It shouldn't hurt in any case because the
1239              * unspill destination is a block-local temporary.
1240              */
1241             emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
1242                          unspill_dst, subset_spill_offset, count, ip);
1243 	 }
1244       }
1245 
1246       if (inst->dst.file == VGRF &&
1247           inst->dst.nr == spill_reg &&
1248           inst->opcode != ELK_SHADER_OPCODE_UNDEF) {
1249          int subset_spill_offset = spill_offset +
1250             ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
1251          elk_fs_reg spill_src = alloc_spill_reg(regs_written(inst), ip);
1252 
1253          inst->dst.nr = spill_src.nr;
1254          inst->dst.offset %= REG_SIZE;
1255 
1256          /* If we're immediately spilling the register, we should not use
1257           * destination dependency hints.  Doing so will cause the GPU to
1258           * try to read and write the register at the same time and may
1259           * hang the GPU.
1260           */
1261          inst->no_dd_clear = false;
1262          inst->no_dd_check = false;
1263 
1264          /* Calculate the execution width of the scratch messages (which work
1265           * in terms of 32 bit components so we have a fixed number of eight
1266           * channels per spilled register).  We attempt to write one
1267           * exec_size-wide component of the variable at a time without
1268           * exceeding the maximum number of (fake) MRF registers reserved for
1269           * spills.
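          * For example, with reg_unit() == 1 a SIMD16 dword destination and
          * two reserved spill registers give width == 16.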
1270           */
1271          const unsigned width = 8 * reg_unit(devinfo) *
1272             DIV_ROUND_UP(MIN2(inst->dst.component_size(inst->exec_size),
1273                               spill_max_size(fs) * REG_SIZE),
1274                          reg_unit(devinfo) * REG_SIZE);
1275 
1276          /* Spills should only write data initialized by the instruction for
1277           * whichever channels are enabled in the execution mask.  If that's
1278           * not possible we'll have to emit a matching unspill before the
1279           * instruction and set force_writemask_all on the spill.
1280           */
1281          const bool per_channel =
1282             inst->dst.is_contiguous() && type_sz(inst->dst.type) == 4 &&
1283             inst->exec_size == width;
1284 
1285          /* Builder used to emit the scratch messages. */
1286          const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);
1287 
1288 	 /* If our write is going to affect just part of the
1289           * regs_written(inst), then we need to unspill the destination since
1290           * we write back out all of the regs_written().  If the original
1291           * instruction had force_writemask_all set and is not a partial
1292           * write, there should be no need for the unspill since the
1293           * instruction will be overwriting the whole destination in any case.
1294 	  */
1295          if (inst->is_partial_write() ||
1296              (!inst->force_writemask_all && !per_channel))
1297             emit_unspill(ubld, &fs->shader_stats, spill_src,
1298                          subset_spill_offset, regs_written(inst), ip);
1299 
1300          emit_spill(ubld.at(block, inst->next), &fs->shader_stats, spill_src,
1301                     subset_spill_offset, regs_written(inst), ip);
1302       }
1303 
1304       for (elk_fs_inst *inst = (elk_fs_inst *)before->next;
1305            inst != after; inst = (elk_fs_inst *)inst->next)
1306          setup_inst_interference(inst);
1307 
1308       /* We don't advance the ip for scratch read/write instructions
1309        * because we consider them to have the same ip as instruction we're
1310        * spilling around for the purposes of interference.  Also, we're
1311        * inserting spill instructions without re-running liveness analysis
1312        * and we don't want to mess up our IPs.
1313        */
1314       if (!_mesa_set_search(spill_insts, inst))
1315          ip++;
1316    }
1317 
1318    assert(ip == live_instr_count);
1319 }
1320 
1321 bool
1322 elk_fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
1323 {
1324    build_interference_graph(fs->spilled_any_registers || spill_all);
1325 
1326    unsigned spilled = 0;
1327    while (1) {
1328       /* Debug of register spilling: Go spill everything. */
1329       if (unlikely(spill_all)) {
1330          int reg = choose_spill_reg();
1331          if (reg != -1) {
1332             spill_reg(reg);
1333             continue;
1334          }
1335       }
1336 
1337       if (ra_allocate(g))
1338          break;
1339 
1340       if (!allow_spilling)
1341          return false;
1342 
1343       /* Failed to allocate registers.  Spill some regs, and the caller will
1344        * loop back into here to try again.
1345        */
1346       unsigned nr_spills = 1;
1347       if (compiler->spilling_rate)
1348          nr_spills = MAX2(1, spilled / compiler->spilling_rate);
1349 
1350       for (unsigned j = 0; j < nr_spills; j++) {
1351          int reg = choose_spill_reg();
1352          if (reg == -1) {
1353             if (j == 0)
1354                return false; /* Nothing to spill */
1355             break;
1356          }
1357 
1358          /* If we're going to spill but we've never spilled before, we need
1359           * to re-build the interference graph with MRFs enabled to allow
1360           * spilling.
1361           */
1362          if (!fs->spilled_any_registers) {
1363             discard_interference_graph();
1364             build_interference_graph(true);
1365          }
1366 
1367          spill_reg(reg);
1368          spilled++;
1369       }
1370    }
1371 
1372    if (spilled)
1373       fs->invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
1374 
1375    /* Get the chosen virtual registers for each node, and map virtual
1376     * regs in the register classes back down to real hardware reg
1377     * numbers.
1378     */
1379    unsigned hw_reg_mapping[fs->alloc.count];
1380    fs->grf_used = fs->first_non_payload_grf;
1381    for (unsigned i = 0; i < fs->alloc.count; i++) {
1382       int reg = ra_get_node_reg(g, first_vgrf_node + i);
1383 
1384       hw_reg_mapping[i] = reg;
1385       fs->grf_used = MAX2(fs->grf_used,
1386 			  hw_reg_mapping[i] + DIV_ROUND_UP(fs->alloc.sizes[i],
1387                                                            reg_unit(devinfo)));
1388    }
1389 
1390    foreach_block_and_inst(block, elk_fs_inst, inst, fs->cfg) {
1391       assign_reg(devinfo, hw_reg_mapping, &inst->dst);
1392       for (int i = 0; i < inst->sources; i++) {
1393          assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
1394       }
1395    }
1396 
1397    fs->alloc.count = fs->grf_used;
1398 
1399    return true;
1400 }
1401 
1402 bool
1403 elk_fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
1404 {
1405    elk_fs_reg_alloc alloc(this);
1406    bool success = alloc.assign_regs(allow_spilling, spill_all);
1407    if (!success && allow_spilling) {
1408       fail("no register to spill:\n");
1409       dump_instructions(NULL);
1410    }
1411    return success;
1412 }
1413