/*
 * Copyright 2023 Asahi Lina
 * SPDX-License-Identifier: MIT
 */

#include "agx_scratch.h"
#include "asahi/compiler/agx_compile.h"
#include "shaders/helper.h"
#include "util/u_hexdump.h"
#include "agx_bo.h"
#include "libagx_shaders.h"
#include "nir.h"
#include "nir_builder_opcodes.h"

#define AGX_ADDR_SHIFT 8
#define AGX_THREADS_PER_GROUP 32
#define AGX_SPILL_UNIT_DWORDS 8

// FIXME: What is the actual value here? Seems to be 96 + 8 or so?
#define AGX_MAX_SUBGROUPS_PER_CORE 128

// Unknown if this goes higher.
#define AGX_MAX_SCRATCH_BLOCK_LOG4 6
#define AGX_MAX_SCRATCH_DWORDS \
   ((AGX_SPILL_UNIT_DWORDS << (2 * AGX_MAX_SCRATCH_BLOCK_LOG4)) * 4)
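
/* A quick sanity check on the limit above: one spill unit is 8 dwords, the
 * largest block size is 8 << (2 * 6) = 32768 dwords, and up to 4 such blocks
 * may be chained, giving AGX_MAX_SCRATCH_DWORDS = 131072 dwords (512 KiB at
 * 4 bytes per dword). These sizes are per-thread; the allocation below
 * multiplies by AGX_THREADS_PER_GROUP.
 */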

struct spill_size {
   uint32_t log4_bsize;
   uint32_t count;
};

struct agx_bo *
agx_build_helper(struct agx_device *dev)
{
   struct agx_bo *bo = agx_bo_create(
      dev, sizeof(libagx_g13_helper),
      AGX_BO_READONLY | AGX_BO_EXEC | AGX_BO_LOW_VA, "Helper shader");
   assert(bo);
   memcpy(bo->ptr.cpu, libagx_g13_helper, sizeof(libagx_g13_helper));

   if (dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Helper: 0x%" PRIx64 "\n", bo->ptr.gpu);

   return bo;
}

static struct spill_size
agx_scratch_get_spill_size(unsigned dwords)
{
   if (!dwords) {
      return (struct spill_size){0, 0};
   }
   assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");

   unsigned log4 =
      util_logbase2(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)) / 2;
   unsigned blocks = DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS << (2 * log4));
   if (log4 > AGX_MAX_SCRATCH_BLOCK_LOG4) {
      // Max size case (4 blocks)
      assert(log4 == (AGX_MAX_SCRATCH_BLOCK_LOG4 + 1));
      log4--;
      blocks = 4;
   } else if (blocks == 4) {
      // Non-max-size 4-block case: shift to the next log4 unit for
      // consistency.
      log4++;
      blocks = 1;
   }

   return (struct spill_size){log4, blocks};
}
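
/* Worked example: for dwords = 100, DIV_ROUND_UP(100, 8) = 13 spill units,
 * so log4 = floor(log2(13)) / 2 = 1 and blocks = DIV_ROUND_UP(100, 32) = 4.
 * A full set of 4 blocks is equivalent to one block of the next log4 size,
 * so this folds to {log4_bsize = 2, count = 1}: a single 128-dword block.
 */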

unsigned
agx_scratch_get_bucket(uint32_t dwords)
{
   /* For debugging/analysis purposes, scratch allocation sizes are
    * divided into buckets. Since we only allocate a single global
    * worst-case scratch buffer, these buckets do not have any meaning
    * for the actual allocation mechanism. They are only used to log
    * allocation sizes. We just use a simple log2 of the size here.
    */

   if (!dwords)
      return 0;
   assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");

   return MIN2(
      AGX_SPILL_SIZE_BUCKETS - 1,
      1 + util_logbase2_ceil(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)));
}
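
/* Example bucketing: anything from 1 to 8 dwords (one spill unit) maps to
 * bucket 1, 100 dwords (13 units, so 1 + ceil(log2(13)) = 5) maps to bucket
 * 5 before clamping, and very large sizes clamp to
 * AGX_SPILL_SIZE_BUCKETS - 1.
 */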

static void
agx_scratch_realloc(struct agx_scratch *scratch)
{
   if (scratch->buf)
      agx_bo_unreference(scratch->buf);

   struct spill_size size = agx_scratch_get_spill_size(scratch->size_dwords);

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Scratch realloc: %d (%d:%d) x %d\n",
              scratch->size_dwords, size.log4_bsize, size.count,
              scratch->subgroups);

   unsigned block_dwords = AGX_SPILL_UNIT_DWORDS << (2 * size.log4_bsize);
   size_t block_size_bytes = (AGX_THREADS_PER_GROUP * 4) * block_dwords;
   scratch->size_dwords = block_dwords * size.count;

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Block size: 0x%zx bytes (%d)\n", block_size_bytes,
              size.log4_bsize);

   unsigned block_count = size.count;

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Block count: %d\n", block_count);

   size_t core_alloc = block_size_bytes * block_count * scratch->subgroups;

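   /* Buffer layout, as computed below: an agx_helper_header, then one block
    * list entry per (core, subgroup) pair, then (after padding up to the
    * block size) the scratch blocks themselves, grouped per core. Aligning
    * the block area to block_size_bytes also keeps every block 256-byte
    * aligned, which the >> AGX_ADDR_SHIFT encoding in the block lists
    * relies on.
    */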
   size_t header_size = sizeof(struct agx_helper_header);

   size_t blocklist_off = header_size;
   size_t blocklist_core_size =
      scratch->subgroups * sizeof(struct agx_helper_block);
   size_t blocklist_size = blocklist_core_size * scratch->num_cores;

   size_t blocks_off = align(header_size + blocklist_size, block_size_bytes);
   size_t total_alloc = blocks_off + core_alloc * scratch->num_cores;

   unsigned flags = 0;
#ifdef SCRATCH_DEBUG
   flags = AGX_BO_WRITEBACK;
#endif
   scratch->buf = agx_bo_create_aligned(scratch->dev, total_alloc,
                                        block_size_bytes, flags, "Scratch");
   memset(scratch->buf->ptr.cpu, 0, blocks_off);

   struct agx_helper_header *hdr = scratch->buf->ptr.cpu;
   scratch->header = hdr;

   uint64_t blocklist_gpu = scratch->buf->ptr.gpu + blocklist_off;
   struct agx_helper_block *blocklist_cpu =
      scratch->buf->ptr.cpu + blocklist_off;

#ifdef SCRATCH_DEBUG
   scratch->blocklist = blocklist_cpu;
   scratch->data = scratch->buf->ptr.cpu + blocks_off;
   scratch->core_size = block_size_bytes * block_count * scratch->subgroups;
#endif

   uint64_t blocks_gpu = scratch->buf->ptr.gpu + blocks_off;

   hdr->subgroups = scratch->subgroups;

   unsigned num_cores = 0;
   unsigned core_id;
   for (core_id = 0; core_id < AGX_MAX_CORE_ID; core_id++) {
#ifndef SCRATCH_DEBUG_CORES
      /* Skip core IDs that are absent from the device's core masks. */
      unsigned cores_per_cluster =
         util_next_power_of_two(scratch->dev->params.num_cores_per_cluster);
      unsigned cluster = core_id / cores_per_cluster;
      unsigned core = core_id % cores_per_cluster;
      if (cluster >= scratch->dev->params.num_clusters_total)
         break;
      if (core >= scratch->dev->params.num_cores_per_cluster ||
          !(scratch->dev->params.core_masks[cluster] & BITFIELD_BIT(core)))
         continue;
#endif
      num_cores++;
#ifdef SCRATCH_DEBUG
      scratch->core_present[core_id] = true;
#endif

      hdr->cores[core_id].blocklist = blocklist_gpu;

      for (unsigned sg = 0; sg < scratch->subgroups; sg++) {
         /* Each block list entry packs a 256-byte-aligned GPU address
          * (shifted right by AGX_ADDR_SHIFT) with low bits used as flags:
          * the first entry carries the block size mask, follow-on entries
          * just carry a valid bit.
          */
         uint32_t mask = BITFIELD_MASK(size.log4_bsize + 1);
         assert(!(blocks_gpu & (block_size_bytes - 1)));

         uint32_t base = blocks_gpu >> AGX_ADDR_SHIFT;
         uint32_t stride = block_size_bytes >> AGX_ADDR_SHIFT;
         blocklist_cpu[sg].blocks[0] = mask | base;
         for (int block = 1; block <= 3; block++) {
            if (block_count >= (block + 1))
               blocklist_cpu[sg].blocks[block] = 1 | (base + block * stride);
            else
               blocklist_cpu[sg].blocks[block] = 0;
         }

         blocks_gpu += block_size_bytes * block_count;
      }

      blocklist_gpu += sizeof(struct agx_helper_block) * scratch->subgroups;
      blocklist_cpu += scratch->subgroups;
   }
   scratch->max_core_id = core_id;
   assert(num_cores == scratch->num_cores);

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "New Scratch @ 0x%" PRIx64 " (size: 0x%zx)\n",
              scratch->buf->ptr.gpu, scratch->buf->size);
}

void
agx_scratch_alloc(struct agx_scratch *scratch, unsigned dwords,
                  size_t subgroups)
{
   bool realloc = false;

   if (!dwords)
      return;

   assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");

   if (!subgroups)
      subgroups = AGX_MAX_SUBGROUPS_PER_CORE;

   subgroups = MIN2(AGX_MAX_SUBGROUPS_PER_CORE, subgroups);

   if (dwords > scratch->size_dwords) {
      scratch->size_dwords = dwords;
      realloc = true;
   }

   if (subgroups > scratch->subgroups) {
      scratch->subgroups = subgroups;
      realloc = true;
   }

   if (realloc) {
      agx_scratch_realloc(scratch);
   }
}

void
agx_scratch_debug_pre(struct agx_scratch *scratch)
{
   if (!scratch->buf)
      return;

   for (int core = 0; core < scratch->max_core_id; core++) {
      assert(!scratch->header->cores[core].alloc_cur);
      scratch->header->cores[core].alloc_max = 0;
      scratch->header->cores[core].alloc_failed = 0;
      memset(scratch->header->cores[core].alloc_count, 0,
             sizeof(scratch->header->cores[core].alloc_count));
   }
}

void
agx_scratch_debug_post(struct agx_scratch *scratch)
{
   if (!scratch->buf)
      return;

   fprintf(stderr, "Scratch @ 0x%" PRIx64 "\n", scratch->buf->ptr.gpu);

   for (int core = 0; core < scratch->max_core_id; core++) {
      fprintf(stderr, "Core %3d: max %d, failed %d, counts:", core,
              scratch->header->cores[core].alloc_max,
              scratch->header->cores[core].alloc_failed);

      for (unsigned bucket = 0; bucket < AGX_SPILL_SIZE_BUCKETS; bucket++) {
         fprintf(stderr, " %d:%-3d",
                 bucket ? (AGX_SPILL_UNIT_DWORDS << (bucket - 1)) : 0,
                 scratch->header->cores[core].alloc_count[bucket]);
      }
      fprintf(stderr, "\n");
      assert(!scratch->header->cores[core].alloc_cur);
      assert(!scratch->header->cores[core].alloc_failed);
   }

#ifdef SCRATCH_DEBUG
   unsigned core_index = 0;
   for (int core = 0; core < scratch->max_core_id; core++) {
      if (!scratch->core_present[core])
         continue;
      void *p = scratch->data + scratch->core_size * core_index++;
      fprintf(stderr, "\nCORE %d (0x%zx)\n", core, scratch->core_size);
      u_hexdump(stderr, p, scratch->core_size, true);
   }
#endif
}

void
agx_scratch_init(struct agx_device *dev, struct agx_scratch *scratch)
{
   memset(scratch, 0, sizeof(*scratch));

   scratch->dev = dev;
#ifdef SCRATCH_DEBUG_CORES
   scratch->num_cores = SCRATCH_DEBUG_CORES;
#else
   scratch->num_cores = 0;
   for (unsigned cl = 0; cl < dev->params.num_clusters_total; cl++) {
      scratch->num_cores += util_bitcount(dev->params.core_masks[cl]);
   }
#endif
}

void
agx_scratch_fini(struct agx_scratch *scratch)
{
   if (scratch->buf)
      agx_bo_unreference(scratch->buf);
   scratch->buf = NULL;
}
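
/* Usage sketch (hypothetical caller code; "spill_dwords" stands in for the
 * compiler-reported per-thread spill size of the bound shader):
 *
 *    struct agx_scratch scratch;
 *    agx_scratch_init(dev, &scratch);
 *
 *    // Whenever a shader that spills is bound, grow the buffer as needed.
 *    // Passing 0 subgroups requests the per-core worst case.
 *    agx_scratch_alloc(&scratch, spill_dwords, 0);
 *
 *    // scratch.header / scratch.buf now describe the block lists consumed
 *    // by the helper program built by agx_build_helper().
 *
 *    agx_scratch_fini(&scratch);
 */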