/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "bi_builder.h"

/* This optimization pass, intended to run once after code emission but before
 * copy propagation, analyzes direct word-aligned UBO reads and promotes a
 * subset to moves from FAU. It is the sole populator of the UBO push data
 * structure returned to the command stream. */
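
/* For example (illustrative syntax; the exact opcodes and FAU assignment
 * depend on the ISA and on what bi_pick_ubo selects), a direct aligned load
 * of 32-bit words 2 and 3 of UBO 1:
 *
 *    LOAD.i64 dst, #8, #1
 *
 * may be rewritten, if those words were selected for push, into a collect of
 * the two FAU words they were assigned:
 *
 *    COLLECT.i32 dst, fau_word(w), fau_word(w + 1)
 *
 * where fau_word() is shorthand for the (slot, half) pair computed below in
 * bi_opt_push_ubo. */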

static bool
bi_is_ubo(bi_instr *ins)
{
        return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) &&
                (ins->seg == BI_SEG_UBO);
}

static bool
bi_is_direct_aligned_ubo(bi_instr *ins)
{
        return bi_is_ubo(ins) &&
                (ins->src[0].type == BI_INDEX_CONSTANT) &&
                (ins->src[1].type == BI_INDEX_CONSTANT) &&
                ((ins->src[0].value & 0x3) == 0);
}

/* Represents use data for a single UBO */

#define MAX_UBO_WORDS (65536 / 16)

struct bi_ubo_block {
        /* 32-bit words that bi_pick_ubo selected for pushing */
        BITSET_DECLARE(pushed, MAX_UBO_WORDS);

        /* Maximum channel count of the direct loads based at each word, or
         * zero if the word is never the base of a direct aligned load */
        uint8_t range[MAX_UBO_WORDS];
};

struct bi_ubo_analysis {
        /* Per block analysis */
        unsigned nr_blocks;
        struct bi_ubo_block *blocks;
};

static struct bi_ubo_analysis
bi_analyze_ranges(bi_context *ctx)
{
        struct bi_ubo_analysis res = {
                /* One extra block past the NIR-declared UBOs, for the sysval
                 * UBO (cf. the sysval priority note above bi_pick_ubo) */
                .nr_blocks = ctx->nir->info.num_ubos + 1,
        };

        res.blocks = calloc(res.nr_blocks, sizeof(struct bi_ubo_block));

        bi_foreach_instr_global(ctx, ins) {
                if (!bi_is_direct_aligned_ubo(ins)) continue;

                unsigned ubo = ins->src[1].value;
                unsigned word = ins->src[0].value / 4;
                unsigned channels = bi_opcode_props[ins->op].sr_count;

                assert(ubo < res.nr_blocks);
                assert(channels > 0 && channels <= 4);

                if (word >= MAX_UBO_WORDS) continue;

                /* Must use max if the same base is read with different
                 * channel counts, which is possible with
                 * nir_opt_shrink_vectors: e.g. a vec2 and a vec4 load of the
                 * same word yield range[word] = 4 */
                uint8_t *range = res.blocks[ubo].range;
                range[word] = MAX2(range[word], channels);
        }

        return res;
}

/* Select UBO words to push. A sophisticated implementation would consider the
 * number of uses and perhaps the control flow to estimate benefit. This is not
 * sophisticated. Select from the last UBO first to prioritize sysvals. */
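
/* A minimal worked example: if range[2] = 3 for some UBO, words 2, 3 and 4
 * are appended to push->words as consecutive entries and word 2 is marked
 * pushed. The walk returns as soon as a range no longer fits under
 * PAN_MAX_PUSH, leaving all later words unpushed. */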

static void
bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis)
{
        for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
                struct bi_ubo_block *block = &analysis->blocks[ubo];

                for (unsigned r = 0; r < MAX_UBO_WORDS; ++r) {
                        unsigned range = block->range[r];

                        /* Don't push something we don't access */
                        if (range == 0) continue;

                        /* Don't push more than possible */
                        if (push->count > PAN_MAX_PUSH - range)
                                return;

                        for (unsigned offs = 0; offs < range; ++offs) {
                                struct panfrost_ubo_word word = {
                                        .ubo = ubo,
                                        .offset = (r + offs) * 4
                                };

                                push->words[push->count++] = word;
                        }

                        /* Mark it as pushed so we can rewrite */
                        BITSET_SET(block->pushed, r);
                }
        }
}

void
bi_opt_push_ubo(bi_context *ctx)
{
        struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
        bi_pick_ubo(ctx->info.push, &analysis);

        ctx->ubo_mask = 0;

        bi_foreach_instr_global_safe(ctx, ins) {
                if (!bi_is_ubo(ins)) continue;

                unsigned ubo = ins->src[1].value;
                unsigned offset = ins->src[0].value;

                if (!bi_is_direct_aligned_ubo(ins)) {
                        /* The load can't be pushed, so this UBO needs to be
                         * uploaded conventionally. If the UBO index itself is
                         * not a constant, any UBO may be hit, so all of them
                         * must be uploaded. */
                        if (ins->src[1].type == BI_INDEX_CONSTANT)
                                ctx->ubo_mask |= BITSET_BIT(ubo);
                        else
                                ctx->ubo_mask = ~0;

                        continue;
                }

                /* Check if we decided to push this */
                assert(ubo < analysis.nr_blocks);
                if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) {
                        ctx->ubo_mask |= BITSET_BIT(ubo);
                        continue;
                }

                /* Replace the UBO load with moves from FAU */
                bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));

                bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0]);
                vec->nr_srcs = bi_opcode_props[ins->op].sr_count;

                for (unsigned w = 0; w < vec->nr_srcs; ++w) {
                        /* FAU is grouped in pairs (2 x 4-byte), so the pushed
                         * word at index `base` lives in 64-bit FAU slot
                         * (base >> 1), in the high half when (base & 1) */
                        unsigned base =
                                pan_lookup_pushed_ubo(ctx->info.push, ubo,
                                                      (offset + 4 * w));

                        unsigned fau_idx = (base >> 1);
                        unsigned fau_hi = (base & 1);

                        vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi);
                }

                bi_remove_instruction(ins);
        }

        free(analysis.blocks);
}

/* One row of the adjacency matrix over pushed 32-bit uniform words */
typedef struct {
        BITSET_DECLARE(row, PAN_MAX_PUSH);
} adjacency_row;

/* Find the connected component containing `node` with depth-first search */
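/* (Each node is marked visited before its neighbours are expanded, so every
 * node is visited at most once and the recursion depth is bounded by
 * PAN_MAX_PUSH.) */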
static void
bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited,
                  unsigned *component, unsigned *size, unsigned node)
{
        unsigned neighbour;

        BITSET_SET(visited, node);
        component[(*size)++] = node;

        BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) {
                if (!BITSET_TEST(visited, neighbour)) {
                        bi_find_component(adjacency, visited, component, size,
                                          neighbour);
                }
        }
}

static bool
bi_is_uniform(bi_index idx)
{
        return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM);
}

/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */
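/* (E.g. bi_fau(BIR_FAU_UNIFORM | 3, 1) names the high half of 64-bit FAU
 * slot 3, i.e. 32-bit word (3 << 1) | 1 = 7. This is the inverse of the
 * (base >> 1, base & 1) split performed in bi_opt_push_ubo.) */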
static unsigned
bi_uniform_word(bi_index idx)
{
        assert(bi_is_uniform(idx));
        assert(idx.offset <= 1);

        return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset;
}

/*
 * Create an undirected graph where nodes are 32-bit uniform indices and edges
 * represent that two nodes are used in the same instruction.
 *
 * The graph is constructed as an adjacency matrix stored in adjacency.
 */
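/*
 * For example, an instruction whose sources read pushed uniform words 4 and 7
 * adds the undirected edge 4-7, so both words land in the same connected
 * component and become candidates to share a 64-bit FAU slot when reordering.
 */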
static void
bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency)
{
        bi_foreach_instr_global(ctx, I) {
                unsigned nodes[BI_MAX_SRCS] = {};
                unsigned node_count = 0;

                /* Set nodes[] to 32-bit uniforms accessed */
                bi_foreach_src(I, s) {
                        if (bi_is_uniform(I->src[s])) {
                                unsigned word = bi_uniform_word(I->src[s]);

                                if (word >= ctx->info.push_offset)
                                        nodes[node_count++] = word;
                        }
                }

                /* Create clique connecting nodes[] */
                for (unsigned i = 0; i < node_count; ++i) {
                        for (unsigned j = 0; j < node_count; ++j) {
                                if (i == j)
                                        continue;

                                unsigned x = nodes[i], y = nodes[j];
                                assert(MAX2(x, y) < ctx->info.push->count);

                                /* Add undirected edge between the nodes */
                                BITSET_SET(adjacency[x].row, y);
                                BITSET_SET(adjacency[y].row, x);
                        }
                }
        }
}

/*
 * Optimization pass to reorder uniforms. The goal is to reduce the number of
 * moves we emit when lowering FAU. The pass groups uniforms used by the same
 * instruction.
 *
 * The pass works by creating a graph of pushed uniforms, where edges denote
 * the "both 32-bit uniforms required by the same instruction" relationship.
 * We perform depth-first search on this graph to find the connected
 * components, where each connected component is a cluster of uniforms that
 * are used together. We then select pairs of uniforms from each connected
 * component. The remaining unpaired uniforms (from components of odd sizes)
 * are paired together arbitrarily.
 *
 * After a new ordering is selected, pushed uniforms in the program and the
 * panfrost_ubo_push data structure must be remapped to use the new ordering.
 */
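
/* For example, given the components {4, 5, 6} and {7, 8}, the new ordering is
 * (4, 5), (7, 8), then the leftover 6: 4/5 and 7/8 each share a 64-bit FAU
 * slot, while 6 pairs arbitrarily with whatever unpaired word follows it. */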
void
bi_opt_reorder_push(bi_context *ctx)
{
        adjacency_row adjacency[PAN_MAX_PUSH] = { 0 };
        BITSET_DECLARE(visited, PAN_MAX_PUSH) = { 0 };

        unsigned ordering[PAN_MAX_PUSH] = { 0 };
        unsigned unpaired[PAN_MAX_PUSH] = { 0 };
        unsigned pushed = 0, unpaired_count = 0;

        struct panfrost_ubo_push *push = ctx->info.push;
        unsigned push_offset = ctx->info.push_offset;

        bi_create_fau_interference_graph(ctx, adjacency);

        for (unsigned i = push_offset; i < push->count; ++i) {
                if (BITSET_TEST(visited, i)) continue;

                unsigned component[PAN_MAX_PUSH] = { 0 };
                unsigned size = 0;
                bi_find_component(adjacency, visited, component, &size, i);

                /* If the component has an odd number of words, at least one
                 * word must be unpaired. Arbitrarily take the last one. */
                if (size % 2)
                        unpaired[unpaired_count++] = component[--size];

                /* The remaining words are paired */
                assert((size % 2) == 0);

                /* Push the paired words */
                memcpy(ordering + pushed, component, sizeof(unsigned) * size);
                pushed += size;
        }

        /* Push unpaired nodes at the end */
        memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count);
        pushed += unpaired_count;

        /* Ordering is a permutation. Invert it for O(1) lookup. */
        unsigned old_to_new[PAN_MAX_PUSH] = { 0 };

        for (unsigned i = 0; i < push_offset; ++i) {
                old_to_new[i] = i;
        }

        for (unsigned i = 0; i < pushed; ++i) {
                assert(ordering[i] >= push_offset);
                old_to_new[ordering[i]] = push_offset + i;
        }

        /* Use new ordering throughout the program */
        bi_foreach_instr_global(ctx, I) {
                bi_foreach_src(I, s) {
                        if (bi_is_uniform(I->src[s])) {
                                unsigned node = bi_uniform_word(I->src[s]);
                                unsigned new_node = old_to_new[node];
                                I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1);
                                I->src[s].offset = new_node & 1;
                        }
                }
        }

        /* Use new ordering for push */
        struct panfrost_ubo_push old = *push;
        for (unsigned i = 0; i < pushed; ++i)
                push->words[push_offset + i] = old.words[ordering[i]];

        push->count = push_offset + pushed;
}