1 /*
2 * Copyright 2023 Asahi Lina
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "agx_scratch.h"
7 #include "libagx/helper.h"
8 #include "agx_bo.h"
9 #include "libagx_shaders.h"
10
/* GPU addresses in spill blocklist entries are stored shifted right by this
 * many bits (making room for the size mask in the low bits). */
#define AGX_ADDR_SHIFT 8
/* Threads per subgroup; used to scale per-thread dwords to bytes. */
#define AGX_THREADS_PER_GROUP 32
/* Granularity of spill allocations, in dwords per thread. */
#define AGX_SPILL_UNIT_DWORDS 8

// FIXME: What is the actual value here? Seems to be 96 + 8 or so?
#define AGX_MAX_SUBGROUPS_PER_CORE 128

// Unknown if this goes higher.
#define AGX_MAX_SCRATCH_BLOCK_LOG4 6
/* Largest representable scratch size: 4 blocks of the maximum block size
 * (see agx_scratch_get_spill_size for the block encoding). */
#define AGX_MAX_SCRATCH_DWORDS \
   ((AGX_SPILL_UNIT_DWORDS << (2 * AGX_MAX_SCRATCH_BLOCK_LOG4)) * 4)
22
/* A spill allocation size, expressed as `count` blocks of
 * AGX_SPILL_UNIT_DWORDS << (2 * log4_bsize) dwords per thread each. */
struct spill_size {
   uint32_t log4_bsize; /* log4 of the block size, in spill units */
   uint32_t count;      /* number of blocks (0 when empty, else 1..4) */
};
27
28 static struct spill_size
agx_scratch_get_spill_size(unsigned dwords)29 agx_scratch_get_spill_size(unsigned dwords)
30 {
31 if (!dwords) {
32 return (struct spill_size){0, 0};
33 }
34 assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
35
36 unsigned log4 =
37 util_logbase2(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)) / 2;
38 unsigned blocks = DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS << (2 * log4));
39 if (log4 > AGX_MAX_SCRATCH_BLOCK_LOG4) {
40 // Max size case (4 blocks)
41 assert(log4 == (AGX_MAX_SCRATCH_BLOCK_LOG4 + 1));
42 log4--;
43 blocks = 4;
44 } else if (blocks == 4) {
45 // Non max size 4 block case, shift to next log4 unit for consistency.
46 log4++;
47 blocks = 1;
48 }
49
50 return (struct spill_size){log4, blocks};
51 }
52
53 unsigned
agx_scratch_get_bucket(uint32_t dwords)54 agx_scratch_get_bucket(uint32_t dwords)
55 {
56 /* For debugging/analysis purposes, scratch allocation sizes are
57 * divided into buckets. Since we only allocate a single global
58 * worst-case scratch buffer, these buckets do not have any meaning
59 * for the actual allocation mechanism. They are only used to log
60 * allocation sizes. We just use a simple log2 of the size here.
61 */
62
63 if (!dwords)
64 return 0;
65 assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
66
67 return MIN2(
68 AGX_SPILL_SIZE_BUCKETS - 1,
69 1 + util_logbase2_ceil(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)));
70 }
71
/*
 * (Re)allocate the global scratch/spill buffer to cover the current
 * worst-case requirement (scratch->size_dwords per thread, up to
 * scratch->subgroups subgroups on each core).
 *
 * BO layout:
 *   [agx_helper_header][per-core blocklists][pad][per-core spill blocks]
 *
 * Any previously allocated buffer is unreferenced first.
 * NOTE(review): no fencing is visible here — presumably the caller
 * guarantees the old buffer is idle; confirm.
 */
static void
agx_scratch_realloc(struct agx_scratch *scratch)
{
   if (scratch->buf)
      agx_bo_unreference(scratch->dev, scratch->buf);

   struct spill_size size = agx_scratch_get_spill_size(scratch->size_dwords);

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Scratch realloc: %d (%d:%d) x %d\n",
              scratch->size_dwords, size.log4_bsize, size.count,
              scratch->subgroups);

   /* Per-thread dwords in one block; a block serves a whole subgroup, hence
    * the scaling by AGX_THREADS_PER_GROUP * 4 bytes per dword. */
   unsigned block_dwords = AGX_SPILL_UNIT_DWORDS << (2 * size.log4_bsize);
   size_t block_size_bytes = (AGX_THREADS_PER_GROUP * 4) * block_dwords;
   /* Round the recorded size up to what is actually being allocated. */
   scratch->size_dwords = block_dwords * size.count;

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Block size: 0x%zx bytes (%d)\n", block_size_bytes,
              size.log4_bsize);

   unsigned block_count = size.count;

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Block count: %d\n", block_count);

   /* Total spill storage needed by a single core. */
   size_t core_alloc = block_size_bytes * block_count * scratch->subgroups;

   size_t header_size = sizeof(struct agx_helper_header);

   size_t blocklist_off = header_size;
   size_t blocklist_core_size =
      scratch->subgroups * sizeof(struct agx_helper_block);
   size_t blocklist_size = blocklist_core_size * scratch->num_cores;

   /* Spill blocks must be naturally aligned to the block size (asserted in
    * the encoding loop below). */
   size_t blocks_off = align(header_size + blocklist_size, block_size_bytes);
   size_t total_alloc = blocks_off + core_alloc * scratch->num_cores;

   unsigned flags = 0;
#ifdef SCRATCH_DEBUG
   /* CPU-coherent mapping so agx_scratch_debug_post() can hexdump it. */
   flags = AGX_BO_WRITEBACK;
#endif
   scratch->buf = agx_bo_create(scratch->dev, total_alloc, block_size_bytes,
                                flags, "Scratch");
   void *map = agx_bo_map(scratch->buf);
   /* Zero only the header and blocklists; spill data is left untouched. */
   memset(map, 0, blocks_off);

   struct agx_helper_header *hdr = map;
   scratch->header = hdr;

   uint64_t blocklist_gpu = scratch->buf->va->addr + blocklist_off;
   struct agx_helper_block *blocklist_cpu = map + blocklist_off;

#ifdef SCRATCH_DEBUG
   scratch->blocklist = blocklist_cpu;
   scratch->data = scratch->buf->map + blocks_off;
   scratch->core_size = block_size_bytes * block_count * scratch->subgroups;
#endif

   uint64_t blocks_gpu = scratch->buf->va->addr + blocks_off;

   hdr->subgroups = scratch->subgroups;

   unsigned num_cores = 0;
   unsigned core_id;
   for (core_id = 0; core_id < AGX_MAX_CORE_ID; core_id++) {
#ifndef SCRATCH_DEBUG_CORES
      /* Core IDs are sparse: addressing rounds cores-per-cluster up to a
       * power of two, and disabled cores are skipped via the core mask. */
      unsigned cores_per_cluster =
         util_next_power_of_two(scratch->dev->params.num_cores_per_cluster);
      unsigned cluster = core_id / cores_per_cluster;
      unsigned core = core_id % cores_per_cluster;
      if (cluster >= scratch->dev->params.num_clusters_total)
         break;
      if (core >= scratch->dev->params.num_cores_per_cluster ||
          !(scratch->dev->params.core_masks[cluster] & BITFIELD_BIT(core)))
         continue;
#endif
      num_cores++;
#ifdef SCRATCH_DEBUG
      scratch->core_present[core_id] = true;
#endif

      hdr->cores[core_id].blocklist = blocklist_gpu;

      for (unsigned sg = 0; sg < scratch->subgroups; sg++) {
         /* Entry encoding: address >> AGX_ADDR_SHIFT in the high bits, block
          * size as (log4_bsize + 1) one-bits in the low bits.
          * NOTE(review): hardware encoding assumed from this construction —
          * verify against the helper shader. */
         uint32_t mask = BITFIELD_MASK(size.log4_bsize + 1);
         assert(!(blocks_gpu & (block_size_bytes - 1)));

         uint32_t base = blocks_gpu >> AGX_ADDR_SHIFT;
         uint32_t stride = block_size_bytes >> AGX_ADDR_SHIFT;
         blocklist_cpu[sg].blocks[0] = mask | base;
         /* Up to three further blocks, each tagged with a unit-size mask;
          * unused slots are zeroed. */
         for (int block = 1; block <= 3; block++) {
            if (block_count >= (block + 1))
               blocklist_cpu[sg].blocks[block] = 1 | (base + block * stride);
            else
               blocklist_cpu[sg].blocks[block] = 0;
         }

         blocks_gpu += block_size_bytes * block_count;
      }

      blocklist_gpu += sizeof(struct agx_helper_block) * scratch->subgroups;
      blocklist_cpu += scratch->subgroups;
   }
   /* max_core_id is the bound for iterating hdr->cores[] later. */
   scratch->max_core_id = core_id;
   assert(num_cores == scratch->num_cores);

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "New Scratch @ 0x%" PRIx64 " (size: 0x%zx)\n",
              scratch->buf->va->addr, scratch->buf->size);
}
183
184 void
agx_scratch_alloc(struct agx_scratch * scratch,unsigned dwords,size_t subgroups)185 agx_scratch_alloc(struct agx_scratch *scratch, unsigned dwords,
186 size_t subgroups)
187 {
188 bool realloc = false;
189
190 if (!dwords)
191 return;
192
193 assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
194
195 if (!subgroups)
196 subgroups = AGX_MAX_SUBGROUPS_PER_CORE;
197
198 subgroups = MIN2(AGX_MAX_SUBGROUPS_PER_CORE, subgroups);
199
200 if (dwords > scratch->size_dwords) {
201 scratch->size_dwords = dwords;
202 realloc = true;
203 }
204
205 if (subgroups > scratch->subgroups) {
206 scratch->subgroups = subgroups;
207 realloc = true;
208 }
209
210 if (realloc) {
211 agx_scratch_realloc(scratch);
212 }
213 }
214
215 void
agx_scratch_debug_pre(struct agx_scratch * scratch)216 agx_scratch_debug_pre(struct agx_scratch *scratch)
217 {
218 if (!scratch->buf)
219 return;
220
221 for (int core = 0; core < scratch->max_core_id; core++) {
222 assert(!scratch->header->cores[core].alloc_cur);
223 scratch->header->cores[core].alloc_max = 0;
224 scratch->header->cores[core].alloc_failed = 0;
225 memset(scratch->header->cores[core].alloc_count, 0,
226 sizeof(scratch->header->cores[core].alloc_count));
227 }
228 }
229
/*
 * Dump and validate per-core scratch statistics after a workload: prints
 * each core's allocation high-water mark, failure count, and per-bucket
 * allocation counts, then asserts everything was released and nothing
 * failed. No-op until a scratch buffer has been allocated.
 */
void
agx_scratch_debug_post(struct agx_scratch *scratch)
{
   if (!scratch->buf)
      return;

   fprintf(stderr, "Scratch @ 0x%" PRIx64 "\n", scratch->buf->va->addr);

   for (int core = 0; core < scratch->max_core_id; core++) {
      fprintf(stderr, "Core %3d: max %d, failed %d, counts:", core,
              scratch->header->cores[core].alloc_max,
              scratch->header->cores[core].alloc_failed);

      /* Bucket 0 is the zero-size bucket; bucket N is labeled with its size
       * threshold of AGX_SPILL_UNIT_DWORDS << (N - 1) dwords (matching
       * agx_scratch_get_bucket). */
      for (unsigned bucket = 0; bucket < AGX_SPILL_SIZE_BUCKETS; bucket++) {
         fprintf(stderr, " %d:%-3d",
                 bucket ? (AGX_SPILL_UNIT_DWORDS << (bucket - 1)) : 0,
                 scratch->header->cores[core].alloc_count[bucket]);
      }
      fprintf(stderr, "\n");
      /* The helper must have freed everything, with no failed allocs. */
      assert(!scratch->header->cores[core].alloc_cur);
      assert(!scratch->header->cores[core].alloc_failed);
   }

#ifdef SCRATCH_DEBUG
   /* Hexdump each present core's scratch region, using the CPU-visible
    * pointers stashed under SCRATCH_DEBUG in agx_scratch_realloc(). */
   unsigned core_index = 0;
   for (int core = 0; core < scratch->max_core_id; core++) {
      if (!scratch->core_present[core])
         continue;
      void *p = scratch->data + scratch->core_size * core_index++;
      fprintf(stderr, "\nCORE %d (0x%lx)\n", core, scratch->core_size);
      u_hexdump(stderr, p, scratch->core_size, true);
   }
#endif
}
264
265 void
agx_scratch_init(struct agx_device * dev,struct agx_scratch * scratch)266 agx_scratch_init(struct agx_device *dev, struct agx_scratch *scratch)
267 {
268 memset(scratch, 0, sizeof(*scratch));
269
270 scratch->dev = dev;
271 #ifdef SCRATCH_DEBUG_CORES
272 scratch->num_cores = SCRATCH_DEBUG_CORES;
273 #else
274 scratch->num_cores = agx_get_num_cores(dev);
275 #endif
276 }
277
278 void
agx_scratch_fini(struct agx_scratch * scratch)279 agx_scratch_fini(struct agx_scratch *scratch)
280 {
281 if (scratch->buf)
282 agx_bo_unreference(scratch->dev, scratch->buf);
283 scratch->buf = NULL;
284 }
285