/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;
11 /**
12  * Split large virtual GRFs into separate components if we can.
13  *
14  * This pass aggressively splits VGRFs into as small a chunks as possible,
15  * down to single registers if it can.  If no VGRFs can be split, we return
16  * false so this pass can safely be used inside an optimization loop.  We
17  * want to split, because virtual GRFs are what we register allocate and
18  * spill (due to contiguousness requirements for some instructions), and
19  * they're what we naturally generate in the codegen process, but most
20  * virtual GRFs don't actually need to be contiguous sets of GRFs.  If we
21  * split, we'll end up with reduced live intervals and better dead code
22  * elimination and coalescing.
23  */
24 bool
brw_fs_opt_split_virtual_grfs(fs_visitor & s)25 brw_fs_opt_split_virtual_grfs(fs_visitor &s)
26 {
27    /* Compact the register file so we eliminate dead vgrfs.  This
28     * only defines split points for live registers, so if we have
29     * too large dead registers they will hit assertions later.
30     */
31    brw_fs_opt_compact_virtual_grfs(s);
32 
33    unsigned num_vars = s.alloc.count;
34 
35    /* Count the total number of registers */
36    unsigned reg_count = 0;
37    unsigned vgrf_to_reg[num_vars];
38    for (unsigned i = 0; i < num_vars; i++) {
39       vgrf_to_reg[i] = reg_count;
40       reg_count += s.alloc.sizes[i];
41    }
42 
43    /* An array of "split points".  For each register slot, this indicates
44     * if this slot can be separated from the previous slot.  Every time an
45     * instruction uses multiple elements of a register (as a source or
46     * destination), we mark the used slots as inseparable.  Then we go
47     * through and split the registers into the smallest pieces we can.
48     */
49    bool *split_points = new bool[reg_count];
50    memset(split_points, 0, reg_count * sizeof(*split_points));
51 
52    /* Mark all used registers as fully splittable */
53    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
54       if (inst->dst.file == VGRF) {
55          unsigned reg = vgrf_to_reg[inst->dst.nr];
56          for (unsigned j = 1; j < s.alloc.sizes[inst->dst.nr]; j++)
57             split_points[reg + j] = true;
58       }
59 
60       for (unsigned i = 0; i < inst->sources; i++) {
61          if (inst->src[i].file == VGRF) {
62             unsigned reg = vgrf_to_reg[inst->src[i].nr];
63             for (unsigned j = 1; j < s.alloc.sizes[inst->src[i].nr]; j++)
64                split_points[reg + j] = true;
65          }
66       }
67    }
68 
69    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
70       /* We fix up undef instructions later */
71       if (inst->opcode == SHADER_OPCODE_UNDEF) {
72          assert(inst->dst.file == VGRF);
73          continue;
74       }
75 
76       if (inst->dst.file == VGRF) {
77          unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
78          for (unsigned j = 1; j < regs_written(inst); j++)
79             split_points[reg + j] = false;
80       }
81       for (unsigned i = 0; i < inst->sources; i++) {
82          if (inst->src[i].file == VGRF) {
83             unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
84             for (unsigned j = 1; j < regs_read(inst, i); j++)
85                split_points[reg + j] = false;
86          }
87       }
88    }
89 
90    /* Bitset of which registers have been split */
91    bool *vgrf_has_split = new bool[num_vars];
92    memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
93 
94    unsigned *new_virtual_grf = new unsigned[reg_count];
95    unsigned *new_reg_offset = new unsigned[reg_count];
96 
97    unsigned reg = 0;
98    bool has_splits = false;
99    for (unsigned i = 0; i < num_vars; i++) {
100       /* The first one should always be 0 as a quick sanity check. */
101       assert(split_points[reg] == false);
102 
103       /* j = 0 case */
104       new_reg_offset[reg] = 0;
105       reg++;
106       unsigned offset = 1;
107 
108       /* j > 0 case */
109       for (unsigned j = 1; j < s.alloc.sizes[i]; j++) {
110          /* If this is a split point, reset the offset to 0 and allocate a
111           * new virtual GRF for the previous offset many registers
112           */
113          if (split_points[reg]) {
114             has_splits = true;
115             vgrf_has_split[i] = true;
116             assert(offset <= MAX_VGRF_SIZE(s.devinfo));
117             unsigned grf = s.alloc.allocate(offset);
118             for (unsigned k = reg - offset; k < reg; k++)
119                new_virtual_grf[k] = grf;
120             offset = 0;
121          }
122          new_reg_offset[reg] = offset;
123          offset++;
124          reg++;
125       }
126 
127       /* The last one gets the original register number */
128       assert(offset <= MAX_VGRF_SIZE(s.devinfo));
129       s.alloc.sizes[i] = offset;
130       for (unsigned k = reg - offset; k < reg; k++)
131          new_virtual_grf[k] = i;
132    }
133    assert(reg == reg_count);
134 
135    bool progress;
136    if (!has_splits) {
137       progress = false;
138       goto cleanup;
139    }
140 
141    foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
142       if (inst->opcode == SHADER_OPCODE_UNDEF) {
143          assert(inst->dst.file == VGRF);
144          if (vgrf_has_split[inst->dst.nr]) {
145             const fs_builder ibld(&s, block, inst);
146             assert(inst->size_written % REG_SIZE == 0);
147             unsigned reg_offset = inst->dst.offset / REG_SIZE;
148             unsigned size_written = 0;
149             while (size_written < inst->size_written) {
150                reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
151                fs_inst *undef =
152                   ibld.UNDEF(
153                      byte_offset(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type),
154                                  new_reg_offset[reg] * REG_SIZE));
155                undef->size_written =
156                   MIN2(inst->size_written - size_written, undef->size_written);
157                assert(undef->size_written % REG_SIZE == 0);
158                size_written += undef->size_written;
159             }
160             inst->remove(block);
161          } else {
162             reg = vgrf_to_reg[inst->dst.nr];
163             assert(new_reg_offset[reg] == 0);
164             assert(new_virtual_grf[reg] == inst->dst.nr);
165          }
166          continue;
167       }
168 
169       if (inst->dst.file == VGRF) {
170          reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
171          if (vgrf_has_split[inst->dst.nr]) {
172             inst->dst.nr = new_virtual_grf[reg];
173             inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
174                                inst->dst.offset % REG_SIZE;
175             assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
176          } else {
177             assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
178             assert(new_virtual_grf[reg] == inst->dst.nr);
179          }
180       }
181       for (unsigned i = 0; i < inst->sources; i++) {
182 	 if (inst->src[i].file != VGRF)
183             continue;
184 
185          reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
186          if (vgrf_has_split[inst->src[i].nr]) {
187             inst->src[i].nr = new_virtual_grf[reg];
188             inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
189                                   inst->src[i].offset % REG_SIZE;
190             assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
191          } else {
192             assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
193             assert(new_virtual_grf[reg] == inst->src[i].nr);
194          }
195       }
196    }
197    s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
198 
199    progress = true;
200 
201 cleanup:
202    delete[] split_points;
203    delete[] vgrf_has_split;
204    delete[] new_virtual_grf;
205    delete[] new_reg_offset;
206 
207    return progress;
208 }
210 /**
211  * Remove unused virtual GRFs and compact the vgrf_* arrays.
212  *
213  * During code generation, we create tons of temporary variables, many of
214  * which get immediately killed and are never used again.  Yet, in later
215  * optimization and analysis passes, such as compute_live_intervals, we need
216  * to loop over all the virtual GRFs.  Compacting them can save a lot of
217  * overhead.
218  */
219 bool
brw_fs_opt_compact_virtual_grfs(fs_visitor & s)220 brw_fs_opt_compact_virtual_grfs(fs_visitor &s)
221 {
222    bool progress = false;
223    int *remap_table = new int[s.alloc.count];
224    memset(remap_table, -1, s.alloc.count * sizeof(int));
225 
226    /* Mark which virtual GRFs are used. */
227    foreach_block_and_inst(block, const fs_inst, inst, s.cfg) {
228       if (inst->dst.file == VGRF)
229          remap_table[inst->dst.nr] = 0;
230 
231       for (int i = 0; i < inst->sources; i++) {
232          if (inst->src[i].file == VGRF)
233             remap_table[inst->src[i].nr] = 0;
234       }
235    }
236 
237    /* Compact the GRF arrays. */
238    int new_index = 0;
239    for (unsigned i = 0; i < s.alloc.count; i++) {
240       if (remap_table[i] == -1) {
241          /* We just found an unused register.  This means that we are
242           * actually going to compact something.
243           */
244          progress = true;
245       } else {
246          remap_table[i] = new_index;
247          s.alloc.sizes[new_index] = s.alloc.sizes[i];
248          s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
249          ++new_index;
250       }
251    }
252 
253    s.alloc.count = new_index;
254 
255    /* Patch all the instructions to use the newly renumbered registers */
256    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
257       if (inst->dst.file == VGRF)
258          inst->dst.nr = remap_table[inst->dst.nr];
259 
260       for (int i = 0; i < inst->sources; i++) {
261          if (inst->src[i].file == VGRF)
262             inst->src[i].nr = remap_table[inst->src[i].nr];
263       }
264    }
265 
266    /* Patch all the references to delta_xy, since they're used in register
267     * allocation.  If they're unused, switch them to BAD_FILE so we don't
268     * think some random VGRF is delta_xy.
269     */
270    for (unsigned i = 0; i < ARRAY_SIZE(s.delta_xy); i++) {
271       if (s.delta_xy[i].file == VGRF) {
272          if (remap_table[s.delta_xy[i].nr] != -1) {
273             s.delta_xy[i].nr = remap_table[s.delta_xy[i].nr];
274          } else {
275             s.delta_xy[i].file = BAD_FILE;
276          }
277       }
278    }
279 
280    delete[] remap_table;
281 
282    return progress;
283 }