• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2023 Asahi Lina
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "agx_scratch.h"
7 #include "libagx/helper.h"
8 #include "agx_bo.h"
9 #include "libagx_shaders.h"
10 
/* Block addresses are stored in the block list shifted right by this much. */
#define AGX_ADDR_SHIFT 8

/* Multiplier used when converting a block size in dwords to bytes. */
#define AGX_THREADS_PER_GROUP 32

/* Smallest block size, in dwords; larger blocks are this times a power of 4. */
#define AGX_SPILL_UNIT_DWORDS 8

// FIXME: What is the actual value here? Seems to be 96 + 8 or so?
#define AGX_MAX_SUBGROUPS_PER_CORE 128

// Unknown if this goes higher.
#define AGX_MAX_SCRATCH_BLOCK_LOG4 6

/* Largest supported scratch size: four blocks of the largest block size. */
#define AGX_MAX_SCRATCH_DWORDS                                                 \
   (4 * (AGX_SPILL_UNIT_DWORDS << (2 * AGX_MAX_SCRATCH_BLOCK_LOG4)))
22 
/*
 * A scratch size expressed as `count` blocks of
 * AGX_SPILL_UNIT_DWORDS << (2 * log4_bsize) dwords each.
 */
struct spill_size {
   /* Base-4 logarithm of the block size, in AGX_SPILL_UNIT_DWORDS units. */
   uint32_t log4_bsize;

   /* Number of blocks of that size. */
   uint32_t count;
};
27 
28 static struct spill_size
agx_scratch_get_spill_size(unsigned dwords)29 agx_scratch_get_spill_size(unsigned dwords)
30 {
31    if (!dwords) {
32       return (struct spill_size){0, 0};
33    }
34    assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
35 
36    unsigned log4 =
37       util_logbase2(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)) / 2;
38    unsigned blocks = DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS << (2 * log4));
39    if (log4 > AGX_MAX_SCRATCH_BLOCK_LOG4) {
40       // Max size case (4 blocks)
41       assert(log4 == (AGX_MAX_SCRATCH_BLOCK_LOG4 + 1));
42       log4--;
43       blocks = 4;
44    } else if (blocks == 4) {
45       // Non max size 4 block case, shift to next log4 unit for consistency.
46       log4++;
47       blocks = 1;
48    }
49 
50    return (struct spill_size){log4, blocks};
51 }
52 
53 unsigned
agx_scratch_get_bucket(uint32_t dwords)54 agx_scratch_get_bucket(uint32_t dwords)
55 {
56    /* For debugging/analysis purposes, scratch allocation sizes are
57     * divided into buckets. Since we only allocate a single global
58     * worst-case scratch buffer, these buckets do not have any meaning
59     * for the actual allocation mechanism. They are only used to log
60     * allocation sizes. We just use a simple log2 of the size here.
61     */
62 
63    if (!dwords)
64       return 0;
65    assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
66 
67    return MIN2(
68       AGX_SPILL_SIZE_BUCKETS - 1,
69       1 + util_logbase2_ceil(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)));
70 }
71 
72 static void
agx_scratch_realloc(struct agx_scratch * scratch)73 agx_scratch_realloc(struct agx_scratch *scratch)
74 {
75    if (scratch->buf)
76       agx_bo_unreference(scratch->dev, scratch->buf);
77 
78    struct spill_size size = agx_scratch_get_spill_size(scratch->size_dwords);
79 
80    if (scratch->dev->debug & AGX_DBG_SCRATCH)
81       fprintf(stderr, "Scratch realloc: %d (%d:%d) x %d\n",
82               scratch->size_dwords, size.log4_bsize, size.count,
83               scratch->subgroups);
84 
85    unsigned block_dwords = AGX_SPILL_UNIT_DWORDS << (2 * size.log4_bsize);
86    size_t block_size_bytes = (AGX_THREADS_PER_GROUP * 4) * block_dwords;
87    scratch->size_dwords = block_dwords * size.count;
88 
89    if (scratch->dev->debug & AGX_DBG_SCRATCH)
90       fprintf(stderr, "Block size: 0x%zx bytes (%d)\n", block_size_bytes,
91               size.log4_bsize);
92 
93    unsigned block_count = size.count;
94 
95    if (scratch->dev->debug & AGX_DBG_SCRATCH)
96       fprintf(stderr, "Block count: %d\n", block_count);
97 
98    size_t core_alloc = block_size_bytes * block_count * scratch->subgroups;
99 
100    size_t header_size = sizeof(struct agx_helper_header);
101 
102    size_t blocklist_off = header_size;
103    size_t blocklist_core_size =
104       scratch->subgroups * sizeof(struct agx_helper_block);
105    size_t blocklist_size = blocklist_core_size * scratch->num_cores;
106 
107    size_t blocks_off = align(header_size + blocklist_size, block_size_bytes);
108    size_t total_alloc = blocks_off + core_alloc * scratch->num_cores;
109 
110    unsigned flags = 0;
111 #ifdef SCRATCH_DEBUG
112    flags = AGX_BO_WRITEBACK;
113 #endif
114    scratch->buf = agx_bo_create(scratch->dev, total_alloc, block_size_bytes,
115                                 flags, "Scratch");
116    void *map = agx_bo_map(scratch->buf);
117    memset(map, 0, blocks_off);
118 
119    struct agx_helper_header *hdr = map;
120    scratch->header = hdr;
121 
122    uint64_t blocklist_gpu = scratch->buf->va->addr + blocklist_off;
123    struct agx_helper_block *blocklist_cpu = map + blocklist_off;
124 
125 #ifdef SCRATCH_DEBUG
126    scratch->blocklist = blocklist_cpu;
127    scratch->data = scratch->buf->map + blocks_off;
128    scratch->core_size = block_size_bytes * block_count * scratch->subgroups;
129 #endif
130 
131    uint64_t blocks_gpu = scratch->buf->va->addr + blocks_off;
132 
133    hdr->subgroups = scratch->subgroups;
134 
135    unsigned num_cores = 0;
136    unsigned core_id;
137    for (core_id = 0; core_id < AGX_MAX_CORE_ID; core_id++) {
138 #ifndef SCRATCH_DEBUG_CORES
139       unsigned cores_per_cluster =
140          util_next_power_of_two(scratch->dev->params.num_cores_per_cluster);
141       unsigned cluster = core_id / cores_per_cluster;
142       unsigned core = core_id % cores_per_cluster;
143       if (cluster >= scratch->dev->params.num_clusters_total)
144          break;
145       if (core >= scratch->dev->params.num_cores_per_cluster ||
146           !(scratch->dev->params.core_masks[cluster] & BITFIELD_BIT(core)))
147          continue;
148 #endif
149       num_cores++;
150 #ifdef SCRATCH_DEBUG
151       scratch->core_present[core_id] = true;
152 #endif
153 
154       hdr->cores[core_id].blocklist = blocklist_gpu;
155 
156       for (unsigned sg = 0; sg < scratch->subgroups; sg++) {
157          uint32_t mask = BITFIELD_MASK(size.log4_bsize + 1);
158          assert(!(blocks_gpu & (block_size_bytes - 1)));
159 
160          uint32_t base = blocks_gpu >> AGX_ADDR_SHIFT;
161          uint32_t stride = block_size_bytes >> AGX_ADDR_SHIFT;
162          blocklist_cpu[sg].blocks[0] = mask | base;
163          for (int block = 1; block <= 3; block++) {
164             if (block_count >= (block + 1))
165                blocklist_cpu[sg].blocks[block] = 1 | (base + block * stride);
166             else
167                blocklist_cpu[sg].blocks[block] = 0;
168          }
169 
170          blocks_gpu += block_size_bytes * block_count;
171       }
172 
173       blocklist_gpu += sizeof(struct agx_helper_block) * scratch->subgroups;
174       blocklist_cpu += scratch->subgroups;
175    }
176    scratch->max_core_id = core_id;
177    assert(num_cores == scratch->num_cores);
178 
179    if (scratch->dev->debug & AGX_DBG_SCRATCH)
180       fprintf(stderr, "New Scratch @ 0x%" PRIx64 " (size: 0x%zx)\n",
181               scratch->buf->va->addr, scratch->buf->size);
182 }
183 
184 void
agx_scratch_alloc(struct agx_scratch * scratch,unsigned dwords,size_t subgroups)185 agx_scratch_alloc(struct agx_scratch *scratch, unsigned dwords,
186                   size_t subgroups)
187 {
188    bool realloc = false;
189 
190    if (!dwords)
191       return;
192 
193    assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
194 
195    if (!subgroups)
196       subgroups = AGX_MAX_SUBGROUPS_PER_CORE;
197 
198    subgroups = MIN2(AGX_MAX_SUBGROUPS_PER_CORE, subgroups);
199 
200    if (dwords > scratch->size_dwords) {
201       scratch->size_dwords = dwords;
202       realloc = true;
203    }
204 
205    if (subgroups > scratch->subgroups) {
206       scratch->subgroups = subgroups;
207       realloc = true;
208    }
209 
210    if (realloc) {
211       agx_scratch_realloc(scratch);
212    }
213 }
214 
215 void
agx_scratch_debug_pre(struct agx_scratch * scratch)216 agx_scratch_debug_pre(struct agx_scratch *scratch)
217 {
218    if (!scratch->buf)
219       return;
220 
221    for (int core = 0; core < scratch->max_core_id; core++) {
222       assert(!scratch->header->cores[core].alloc_cur);
223       scratch->header->cores[core].alloc_max = 0;
224       scratch->header->cores[core].alloc_failed = 0;
225       memset(scratch->header->cores[core].alloc_count, 0,
226              sizeof(scratch->header->cores[core].alloc_count));
227    }
228 }
229 
230 void
agx_scratch_debug_post(struct agx_scratch * scratch)231 agx_scratch_debug_post(struct agx_scratch *scratch)
232 {
233    if (!scratch->buf)
234       return;
235 
236    fprintf(stderr, "Scratch @ 0x%" PRIx64 "\n", scratch->buf->va->addr);
237 
238    for (int core = 0; core < scratch->max_core_id; core++) {
239       fprintf(stderr, "Core %3d: max %d, failed %d, counts:", core,
240               scratch->header->cores[core].alloc_max,
241               scratch->header->cores[core].alloc_failed);
242 
243       for (unsigned bucket = 0; bucket < AGX_SPILL_SIZE_BUCKETS; bucket++) {
244          fprintf(stderr, " %d:%-3d",
245                  bucket ? (AGX_SPILL_UNIT_DWORDS << (bucket - 1)) : 0,
246                  scratch->header->cores[core].alloc_count[bucket]);
247       }
248       fprintf(stderr, "\n");
249       assert(!scratch->header->cores[core].alloc_cur);
250       assert(!scratch->header->cores[core].alloc_failed);
251    }
252 
253 #ifdef SCRATCH_DEBUG
254    unsigned core_index = 0;
255    for (int core = 0; core < scratch->max_core_id; core++) {
256       if (!scratch->core_present[core])
257          continue;
258       void *p = scratch->data + scratch->core_size * core_index++;
259       fprintf(stderr, "\nCORE %d (0x%lx)\n", core, scratch->core_size);
260       u_hexdump(stderr, p, scratch->core_size, true);
261    }
262 #endif
263 }
264 
265 void
agx_scratch_init(struct agx_device * dev,struct agx_scratch * scratch)266 agx_scratch_init(struct agx_device *dev, struct agx_scratch *scratch)
267 {
268    memset(scratch, 0, sizeof(*scratch));
269 
270    scratch->dev = dev;
271 #ifdef SCRATCH_DEBUG_CORES
272    scratch->num_cores = SCRATCH_DEBUG_CORES;
273 #else
274    scratch->num_cores = agx_get_num_cores(dev);
275 #endif
276 }
277 
278 void
agx_scratch_fini(struct agx_scratch * scratch)279 agx_scratch_fini(struct agx_scratch *scratch)
280 {
281    if (scratch->buf)
282       agx_bo_unreference(scratch->dev, scratch->buf);
283    scratch->buf = NULL;
284 }
285