/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "bi_builder.h"

/* This optimization pass, intended to run once after code emission but before
 * copy propagation, analyzes direct word-aligned UBO reads and promotes a
 * subset to moves from FAU. It is the sole populator of the UBO push data
 * structure returned to the command stream. */
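
/* Illustrative example (not actual IR syntax): a direct 16-byte vec4 load at
 * byte offset 32 of UBO 0 covers 32-bit words 8..11 of that UBO. Once those
 * words are selected for push, the load is rewritten with bi_collect_i32_to
 * into a vector whose four sources each read one word straight from FAU. */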

static bool
bi_is_ubo(bi_instr *ins)
{
        return (bi_opcode_props[ins->op].message == BIFROST_MESSAGE_LOAD) &&
                (ins->seg == BI_SEG_UBO);
}

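/* A load is a candidate for pushing only if both the UBO index and the byte
 * offset are compile-time constants and the offset is 32-bit aligned. */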
static bool
bi_is_direct_aligned_ubo(bi_instr *ins)
{
        return bi_is_ubo(ins) &&
                (ins->src[0].type == BI_INDEX_CONSTANT) &&
                (ins->src[1].type == BI_INDEX_CONSTANT) &&
                ((ins->src[0].value & 0x3) == 0);
}

/* Represents use data for a single UBO */

#define MAX_UBO_WORDS (65536 / 16)

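/* For each 32-bit word of the UBO, `range` records the largest number of
 * consecutive words any single load reads starting there (0 if the word is
 * never the base of a direct, aligned load). `pushed` marks the words that
 * bi_pick_ubo selected for promotion to FAU. */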
struct bi_ubo_block {
        BITSET_DECLARE(pushed, MAX_UBO_WORDS);
        uint8_t range[MAX_UBO_WORDS];
};

struct bi_ubo_analysis {
        /* Per block analysis */
        unsigned nr_blocks;
        struct bi_ubo_block *blocks;
};

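/* Gather per-UBO usage: for every direct, aligned UBO load, record how many
 * 32-bit words it reads at its base offset. One block beyond the NIR-counted
 * UBOs is allocated; per the bi_pick_ubo comment below, that trailing UBO is
 * where sysvals live. */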
static struct bi_ubo_analysis
bi_analyze_ranges(bi_context *ctx)
{
        struct bi_ubo_analysis res = {
                .nr_blocks = ctx->nir->info.num_ubos + 1,
        };

        res.blocks = calloc(res.nr_blocks, sizeof(struct bi_ubo_block));

        bi_foreach_instr_global(ctx, ins) {
                if (!bi_is_direct_aligned_ubo(ins)) continue;

                unsigned ubo = ins->src[1].value;
                unsigned word = ins->src[0].value / 4;
                unsigned channels = bi_opcode_props[ins->op].sr_count;

                assert(ubo < res.nr_blocks);
                assert(channels > 0 && channels <= 4);

                if (word >= MAX_UBO_WORDS) continue;

                /* Must use max if the same base is read with different channel
                 * counts, which is possible with nir_opt_shrink_vectors */
                uint8_t *range = res.blocks[ubo].range;
                range[word] = MAX2(range[word], channels);
        }

        return res;
}

/* Select UBO words to push. A sophisticated implementation would consider the
 * number of uses and perhaps the control flow to estimate benefit. This is not
 * sophisticated. Select from the last UBO first to prioritize sysvals. */

static void
bi_pick_ubo(struct panfrost_ubo_push *push, struct bi_ubo_analysis *analysis)
{
        for (signed ubo = analysis->nr_blocks - 1; ubo >= 0; --ubo) {
                struct bi_ubo_block *block = &analysis->blocks[ubo];

                for (unsigned r = 0; r < MAX_UBO_WORDS; ++r) {
                        unsigned range = block->range[r];

                        /* Don't push something we don't access */
                        if (range == 0) continue;

                        /* Don't push more than possible */
                        if (push->count > PAN_MAX_PUSH - range)
                                return;

                        for (unsigned offs = 0; offs < range; ++offs) {
                                struct panfrost_ubo_word word = {
                                        .ubo = ubo,
                                        .offset = (r + offs) * 4
                                };

                                push->words[push->count++] = word;
                        }

                        /* Mark it as pushed so we can rewrite */
                        BITSET_SET(block->pushed, r);
                }
        }
}

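/* Entry point: analyze UBO accesses, pick words to push, then rewrite each
 * pushable load as a collect of FAU sources. UBOs that still have loads we
 * could not push are recorded in ctx->ubo_mask so they are uploaded
 * conventionally; an indirect UBO index conservatively marks every UBO. */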
void
bi_opt_push_ubo(bi_context *ctx)
{
        struct bi_ubo_analysis analysis = bi_analyze_ranges(ctx);
        bi_pick_ubo(ctx->info.push, &analysis);

        ctx->ubo_mask = 0;

        bi_foreach_instr_global_safe(ctx, ins) {
                if (!bi_is_ubo(ins)) continue;

                unsigned ubo = ins->src[1].value;
                unsigned offset = ins->src[0].value;

                if (!bi_is_direct_aligned_ubo(ins)) {
                        /* The load can't be pushed, so this UBO needs to be
                         * uploaded conventionally */
                        if (ins->src[1].type == BI_INDEX_CONSTANT)
                                ctx->ubo_mask |= BITSET_BIT(ubo);
                        else
                                ctx->ubo_mask = ~0;

                        continue;
                }

                /* Check if we decided to push this */
                assert(ubo < analysis.nr_blocks);
                if (!BITSET_TEST(analysis.blocks[ubo].pushed, offset / 4)) {
                        ctx->ubo_mask |= BITSET_BIT(ubo);
                        continue;
                }

                /* Replace the UBO load with moves from FAU */
                bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));

                bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0]);
                vec->nr_srcs = bi_opcode_props[ins->op].sr_count;

                for (unsigned w = 0; w < vec->nr_srcs; ++w) {
                        /* FAU is grouped in pairs (2 x 4-byte) */
                        unsigned base =
                                pan_lookup_pushed_ubo(ctx->info.push, ubo,
                                                      (offset + 4 * w));

                        unsigned fau_idx = (base >> 1);
                        unsigned fau_hi = (base & 1);

                        vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi);
                }

                bi_remove_instruction(ins);
        }

        free(analysis.blocks);
}

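/* One row of the adjacency matrix used below: bit y set in row x means that
 * pushed uniform words x and y are read by the same instruction. */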
typedef struct {
        BITSET_DECLARE(row, PAN_MAX_PUSH);
} adjacency_row;

/* Find the connected component containing `node` with depth-first search */
static void
bi_find_component(adjacency_row *adjacency, BITSET_WORD *visited,
                  unsigned *component, unsigned *size, unsigned node)
{
        unsigned neighbour;

        BITSET_SET(visited, node);
        component[(*size)++] = node;

        BITSET_FOREACH_SET(neighbour, adjacency[node].row, PAN_MAX_PUSH) {
                if (!BITSET_TEST(visited, neighbour)) {
                        bi_find_component(adjacency, visited, component, size,
                                          neighbour);
                }
        }
}

static bool
bi_is_uniform(bi_index idx)
{
        return (idx.type == BI_INDEX_FAU) && (idx.value & BIR_FAU_UNIFORM);
}

/* Get the index of a uniform in 32-bit words from the start of FAU-RAM */
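/* (Each uniform FAU slot holds 64 bits, i.e. two 32-bit words, so the slot
 * index in idx.value is doubled and idx.offset selects the low or high word.) */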
static unsigned
bi_uniform_word(bi_index idx)
{
        assert(bi_is_uniform(idx));
        assert(idx.offset <= 1);

        return ((idx.value & ~BIR_FAU_UNIFORM) << 1) | idx.offset;
}

/*
 * Create an undirected graph where nodes are 32-bit uniform indices and edges
 * represent that two nodes are used in the same instruction.
 *
 * The graph is constructed as an adjacency matrix stored in adjacency.
 */
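/* For instance, an instruction reading pushed words 4 and 6 adds the edge
 * (4, 6), so the reordering below can place those words in the same 64-bit FAU
 * slot, which reduces the number of moves emitted when FAU is lowered later. */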
static void
bi_create_fau_interference_graph(bi_context *ctx, adjacency_row *adjacency)
{
        bi_foreach_instr_global(ctx, I) {
                unsigned nodes[BI_MAX_SRCS] = {};
                unsigned node_count = 0;

                /* Set nodes[] to 32-bit uniforms accessed */
                bi_foreach_src(I, s) {
                        if (bi_is_uniform(I->src[s])) {
                                unsigned word = bi_uniform_word(I->src[s]);

                                if (word >= ctx->info.push_offset)
                                        nodes[node_count++] = word;
                        }
                }

                /* Create clique connecting nodes[] */
                for (unsigned i = 0; i < node_count; ++i) {
                        for (unsigned j = 0; j < node_count; ++j) {
                                if (i == j)
                                        continue;

                                unsigned x = nodes[i], y = nodes[j];
                                assert(MAX2(x, y) < ctx->info.push->count);

                                /* Add undirected edge between the nodes */
                                BITSET_SET(adjacency[x].row, y);
                                BITSET_SET(adjacency[y].row, x);
                        }
                }
        }
}

/*
 * Optimization pass to reorder uniforms. The goal is to reduce the number of
 * moves we emit when lowering FAU. The pass groups uniforms used by the same
 * instruction.
 *
 * The pass works by creating a graph of pushed uniforms, where edges denote the
 * "both 32-bit uniforms required by the same instruction" relationship. We
 * perform depth-first search on this graph to find the connected components,
 * where each connected component is a cluster of uniforms that are used
 * together. We then select pairs of uniforms from each connected component.
 * The remaining unpaired uniforms (from components of odd sizes) are paired
 * together arbitrarily.
 *
 * After a new ordering is selected, pushed uniforms in the program and the
 * panfrost_ubo_push data structure must be remapped to use the new ordering.
 */
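/* Illustrative example: a component {a, b} contributes the adjacent pair
 * (a, b); an odd-sized component {c, d, e} contributes the pair (c, d) and
 * leaves e to be paired arbitrarily with other leftovers at the end. */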
void
bi_opt_reorder_push(bi_context *ctx)
{
        adjacency_row adjacency[PAN_MAX_PUSH] = { 0 };
        BITSET_DECLARE(visited, PAN_MAX_PUSH) = { 0 };

        unsigned ordering[PAN_MAX_PUSH] = { 0 };
        unsigned unpaired[PAN_MAX_PUSH] = { 0 };
        unsigned pushed = 0, unpaired_count = 0;

        struct panfrost_ubo_push *push = ctx->info.push;
        unsigned push_offset = ctx->info.push_offset;

        bi_create_fau_interference_graph(ctx, adjacency);

        for (unsigned i = push_offset; i < push->count; ++i) {
                if (BITSET_TEST(visited, i)) continue;

                unsigned component[PAN_MAX_PUSH] = { 0 };
                unsigned size = 0;
                bi_find_component(adjacency, visited, component, &size, i);

                /* If the component has an odd number of words, at least one
                 * word must be left unpaired. Arbitrarily take the last one.
                 */
                if (size % 2)
                        unpaired[unpaired_count++] = component[--size];

                /* The rest of the words are paired */
                assert((size % 2) == 0);

                /* Push the paired words */
                memcpy(ordering + pushed, component, sizeof(unsigned) * size);
                pushed += size;
        }

        /* Push unpaired nodes at the end */
        memcpy(ordering + pushed, unpaired, sizeof(unsigned) * unpaired_count);
        pushed += unpaired_count;

        /* Ordering is a permutation. Invert it for O(1) lookup. */
        unsigned old_to_new[PAN_MAX_PUSH] = { 0 };

        for (unsigned i = 0; i < push_offset; ++i) {
                old_to_new[i] = i;
        }

        for (unsigned i = 0; i < pushed; ++i) {
                assert(ordering[i] >= push_offset);
                old_to_new[ordering[i]] = push_offset + i;
        }

        /* Use new ordering throughout the program */
        bi_foreach_instr_global(ctx, I) {
                bi_foreach_src(I, s) {
                        if (bi_is_uniform(I->src[s])) {
                                unsigned node = bi_uniform_word(I->src[s]);
                                unsigned new_node = old_to_new[node];
                                I->src[s].value = BIR_FAU_UNIFORM | (new_node >> 1);
                                I->src[s].offset = new_node & 1;
                        }
                }
        }

        /* Use new ordering for push */
        struct panfrost_ubo_push old = *push;
        for (unsigned i = 0; i < pushed; ++i)
                push->words[push_offset + i] = old.words[ordering[i]];

        push->count = push_offset + pushed;
}