/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This pass aggressively splits VGRFs into chunks as small as possible,
 * down to single registers if it can. If no VGRFs can be split, we return
 * false so this pass can safely be used inside an optimization loop. We
 * want to split, because virtual GRFs are what we register allocate and
 * spill (due to contiguousness requirements for some instructions), and
 * they're what we naturally generate in the codegen process, but most
 * virtual GRFs don't actually need to be contiguous sets of GRFs. If we
 * split, we'll end up with reduced live intervals and better dead code
 * elimination and coalescing.
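 *
 * For example, a four-register VGRF whose registers are only ever read and
 * written one at a time ends up split into four independent single-register
 * VGRFs, each with its own (typically much shorter) live interval.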
 */
bool
brw_fs_opt_split_virtual_grfs(fs_visitor &s)
{
   /* Compact the register file so we eliminate dead vgrfs. This only
    * defines split points for live registers, so dead registers that are
    * too large would otherwise hit assertions later.
    */
   brw_fs_opt_compact_virtual_grfs(s);

   unsigned num_vars = s.alloc.count;

   /* Count the total number of registers */
   unsigned reg_count = 0;
   unsigned vgrf_to_reg[num_vars];
   for (unsigned i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += s.alloc.sizes[i];
   }

   /* An array of "split points". For each register slot, this indicates
    * if this slot can be separated from the previous slot. Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable. Then we go
    * through and split the registers into the smallest pieces we can.
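    *
    * For example, with a four-slot VGRF where one instruction writes slots
    * 2 and 3 as a pair, slot 3 can no longer be separated from slot 2, so
    * the VGRF ends up split into pieces of sizes 1, 1 and 2.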
    */
   bool *split_points = new bool[reg_count];
   memset(split_points, 0, reg_count * sizeof(*split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF) {
         unsigned reg = vgrf_to_reg[inst->dst.nr];
         for (unsigned j = 1; j < s.alloc.sizes[inst->dst.nr]; j++)
            split_points[reg + j] = true;
      }

      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            unsigned reg = vgrf_to_reg[inst->src[i].nr];
            for (unsigned j = 1; j < s.alloc.sizes[inst->src[i].nr]; j++)
               split_points[reg + j] = true;
         }
      }
   }

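   /* Walk the instructions again and clear the split points inside any
    * multi-register read or write, since those slots must stay contiguous.
    */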
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      /* We fix up undef instructions later */
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         assert(inst->dst.file == VGRF);
         continue;
      }

      if (inst->dst.file == VGRF) {
         unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         for (unsigned j = 1; j < regs_written(inst); j++)
            split_points[reg + j] = false;
      }
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            for (unsigned j = 1; j < regs_read(inst, i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   /* Bitset of which registers have been split */
   bool *vgrf_has_split = new bool[num_vars];
   memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));

   unsigned *new_virtual_grf = new unsigned[reg_count];
   unsigned *new_reg_offset = new unsigned[reg_count];

   unsigned reg = 0;
   bool has_splits = false;
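   /* Walk the slots of each VGRF and, at every remaining split point,
    * allocate a fresh VGRF for the slots accumulated since the last split.
    * The final piece keeps the original VGRF number, so registers that
    * were never split are left untouched.
    */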
   for (unsigned i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      unsigned offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < s.alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous `offset` registers.
          */
         if (split_points[reg]) {
            has_splits = true;
            vgrf_has_split[i] = true;
            assert(offset <= MAX_VGRF_SIZE(s.devinfo));
            unsigned grf = s.alloc.allocate(offset);
            for (unsigned k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE(s.devinfo));
      s.alloc.sizes[i] = offset;
      for (unsigned k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   bool progress;
   if (!has_splits) {
      progress = false;
      goto cleanup;
   }

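   /* Rewrite every instruction to point at the new, smaller VGRFs,
    * translating each (nr, offset) pair through vgrf_to_reg[],
    * new_virtual_grf[] and new_reg_offset[].
    */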
   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         assert(inst->dst.file == VGRF);
         if (vgrf_has_split[inst->dst.nr]) {
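            /* The original UNDEF may span several of the new, smaller
             * VGRFs, so emit one UNDEF per piece it covered and then
             * drop the original instruction.
             */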
            const fs_builder ibld(&s, block, inst);
            assert(inst->size_written % REG_SIZE == 0);
            unsigned reg_offset = inst->dst.offset / REG_SIZE;
            unsigned size_written = 0;
            while (size_written < inst->size_written) {
               reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
               fs_inst *undef =
                  ibld.UNDEF(
                     byte_offset(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type),
                                 new_reg_offset[reg] * REG_SIZE));
               undef->size_written =
                  MIN2(inst->size_written - size_written, undef->size_written);
               assert(undef->size_written % REG_SIZE == 0);
               size_written += undef->size_written;
            }
            inst->remove(block);
         } else {
            reg = vgrf_to_reg[inst->dst.nr];
            assert(new_reg_offset[reg] == 0);
            assert(new_virtual_grf[reg] == inst->dst.nr);
         }
         continue;
      }

      if (inst->dst.file == VGRF) {
         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         if (vgrf_has_split[inst->dst.nr]) {
            inst->dst.nr = new_virtual_grf[reg];
            inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
                               inst->dst.offset % REG_SIZE;
            assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
         } else {
            assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
            assert(new_virtual_grf[reg] == inst->dst.nr);
         }
      }
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != VGRF)
            continue;

         reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
         if (vgrf_has_split[inst->src[i].nr]) {
            inst->src[i].nr = new_virtual_grf[reg];
            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
                                  inst->src[i].offset % REG_SIZE;
            assert(new_reg_offset[reg] < s.alloc.sizes[new_virtual_grf[reg]]);
         } else {
            assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
            assert(new_virtual_grf[reg] == inst->src[i].nr);
         }
      }
   }
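
   /* Register numbers, offsets and the VGRF count all changed, so any
    * cached analyses that depend on them are now stale.
    */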
   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);

   progress = true;

cleanup:
   delete[] split_points;
   delete[] vgrf_has_split;
   delete[] new_virtual_grf;
   delete[] new_reg_offset;

   return progress;
}

/**
 * Remove unused virtual GRFs and compact the vgrf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again. Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs. Compacting them can save a lot of
 * overhead.
 */
bool
brw_fs_opt_compact_virtual_grfs(fs_visitor &s)
{
   bool progress = false;
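   /* remap_table maps old VGRF numbers to new ones: -1 marks a VGRF that
    * is never referenced; used VGRFs are first flagged with 0 below and
    * then overwritten with their compacted index.
    */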
   int *remap_table = new int[s.alloc.count];
   memset(remap_table, -1, s.alloc.count * sizeof(int));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < s.alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register. This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         s.alloc.sizes[new_index] = s.alloc.sizes[i];
         s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
         ++new_index;
      }
   }

   s.alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation. If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(s.delta_xy); i++) {
      if (s.delta_xy[i].file == VGRF) {
         if (remap_table[s.delta_xy[i].nr] != -1) {
            s.delta_xy[i].nr = remap_table[s.delta_xy[i].nr];
         } else {
            s.delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}