/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "crashdec.h"

static void
dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context,
                        bool pipe)
{
   if (pipe) {
      struct rnndecaddrinfo *info = rnn_reginfo(rnn_pipe, reg);
      printf("\t\twrite %s (%02x) pipe\n", info->name, reg);

      if (!strcmp(info->typeinfo->name, "void")) {
         /* registers that ignore their payload */
      } else {
         printf("\t\t\t");
         dump_register(rnn_pipe, reg, data);
      }
   } else {
      printf("\t\twrite %s (%05x) context %d\n", regname(reg, 1), reg, context);
      dump_register_val(reg, data, 2);
   }
}

static void
dump_mem_pool_chunk(const uint32_t *chunk)
{
   struct __attribute__((packed)) {
      bool reg0_enabled : 1;
      bool reg1_enabled : 1;
      uint32_t data0 : 32;
      uint32_t data1 : 32;
      uint32_t reg0 : 18;
      uint32_t reg1 : 18;
      bool reg0_pipe : 1;
      bool reg1_pipe : 1;
      uint32_t reg0_context : 1;
      uint32_t reg1_context : 1;
      uint32_t padding : 22;
   } fields;

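   /* Note: the bitfields above add up to exactly 128 bits (2 + 64 + 36 +
    * 2 + 2 + 22), i.e. one chunk, which is what the memcpy below copies
    * out of the queue.
    */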
   memcpy(&fields, chunk, 4 * sizeof(uint32_t));

   if (fields.reg0_enabled) {
      dump_mem_pool_reg_write(fields.reg0, fields.data0, fields.reg0_context,
                              fields.reg0_pipe);
   }

   if (fields.reg1_enabled) {
      dump_mem_pool_reg_write(fields.reg1, fields.data1, fields.reg1_context,
                              fields.reg1_pipe);
   }
}

void
dump_cp_mem_pool(uint32_t *mempool)
{
   /* The mem pool is a shared pool of memory used for storing in-flight
    * register writes. There are 6 different queues, one for each
    * cluster. Writing to $data (or, for some special registers, $addr)
    * pushes data onto the appropriate queue, and each queue is pulled
    * from by the appropriate cluster. The queues are thus written to
    * in-order, but may be read out-of-order.
    *
    * The queues are conceptually divided into 128-bit "chunks", and the
    * read and write pointers are in units of chunks. These chunks are
    * organized internally into 8-chunk "blocks", and memory is allocated
    * dynamically in terms of blocks. Each queue is represented as a
    * singly-linked list of blocks, as well as 3-bit start/end chunk
    * pointers that point within the first/last block. The next pointers
    * are located in a separate array, rather than inline.
    */
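
   /* To make the list layout concrete, a minimal sketch of following one
    * queue from its first to its last block (this mirrors the real loop
    * at the bottom of this function, minus the chunk bookkeeping):
    *
    *    uint32_t block = first_block;
    *    while (block != last_block)
    *       block = next_pointers[block];
    */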

   /* TODO: The firmware CP_MEM_POOL save/restore routines do something
    * like:
    *
    * cread $02, [ $00 + 0 ]
    * and $02, $02, 0x118
    * ...
    * brne $02, 0, #label
    * mov $03, 0x2000
    * mov $03, 0x1000
    * label:
    * ...
    *
    * I think that control register 0 is the GPU version, and some
    * versions have a smaller mem pool. It seems some models have a mem
    * pool that's half the size, with a bunch of offsets shifted
    * accordingly. Unfortunately neither the kernel driver's dumping code
    * nor the downstream Android driver seems to take this into account,
    * and we don't know which GPU versions 0x8, 0x10, or 0x100 correspond
    * to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure this out?
    */
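
   /* If that check were understood, the flag below could perhaps be set
    * automatically. A hypothetical sketch (read_control_reg() is a
    * made-up accessor, and the mask and polarity are guesses based on
    * the firmware disassembly above):
    *
    *    bool small_mem_pool = (read_control_reg(0) & 0x118) == 0;
    */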
   bool small_mem_pool = false;

   /* The array of next pointers for each block. */
   const uint32_t *next_pointers =
      small_mem_pool ? &mempool[0x800] : &mempool[0x1000];

   /* Maximum number of blocks in the pool, also the size of the pointers
    * array.
    */
   const int num_blocks = small_mem_pool ? 0x30 : 0x80;

   /* Number of queues */
   const unsigned num_queues = 6;
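
   /* For orientation, the offsets used below lay the dump out roughly
    * like this (dword indices into mempool[]; "small" is the half-size
    * pool described in the TODO above):
    *
    *                      small    normal
    *    chunk data        0x0      0x0
    *    next_pointers     0x800    0x1000
    *    per-queue state   0xc00    0x1800
    *    data2             0x1000   0x2000
    */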

   /* Unfortunately the per-queue state is a little more complicated than
    * a simple pair of begin/end pointers. Instead of a single beginning
    * block, there are *two*, with the property that either the two are
    * equal or the second is the "next" of the first. Similarly there are
    * two end blocks. Thus the queue either looks like this:
    *
    *    A -> B -> ... -> C -> D
    *
    * Or like this, or some combination:
    *
    *    A/B -> ... -> C/D
    *
    * However, there's only one beginning/end chunk offset. Now the
    * question is, which of A or B is the actual start? I.e. is the chunk
    * offset an offset inside A or B? It depends. I'll show a typical read
    * cycle, starting here (read pointer marked with a *) with a chunk
    * offset of 0:
    *
    *        A                     B
    *     _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    *    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
    *
    * Once the pointer advances far enough, the hardware decides to free
    * A, after which the read-side state looks like:
    *
    *     (free)                A/B
    *     _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    *    |_|_|_|_|_|_|_|_|    |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
    *
    * Then after advancing the pointer a bit more, the hardware fetches
    * the "next" pointer for A and stores it in B:
    *
    *     (free)                A                    B
    *     _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    *    |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
    *
    * Then the read pointer advances into B, at which point we've come
    * back to the first state having advanced a whole block:
    *
    *     (free)                A                    B
    *     _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    *    |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
    *
    *
    * There is a similar cycle for the write pointer. Now, the question
    * is, how do we know which state we're in? We need to know this to
    * know whether the pointer (*) is in A or B if they're different. It
    * seems like there should be some bit somewhere describing this, but
    * after lots of experimentation I've come up empty-handed. For now we
    * assume that if the pointer is in the first half, then we're in
    * either the first or second state and use B, and otherwise we're in
    * the second or third state and use A. So far I haven't seen anything
    * that violates this assumption.
    */
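
   /* In code form, that heuristic is just (as used when initializing
    * cur_block/last_block in the per-queue loop below):
    *
    *    block = chunk > 3 ? first_block : second_block;
    */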

   struct {
      uint32_t unk0;
      uint32_t padding0[7]; /* Mirrors of unk0 */

      struct {
         uint32_t chunk : 3;
         uint32_t first_block : 32 - 3;
      } writer[6];
      uint32_t padding1[2]; /* Mirrors of writer[4], writer[5] */

      uint32_t unk1;
      uint32_t padding2[7]; /* Mirrors of unk1 */

      uint32_t writer_second_block[6];
      uint32_t padding3[2];

      uint32_t unk2[6];
      uint32_t padding4[2];

      struct {
         uint32_t chunk : 3;
         uint32_t first_block : 32 - 3;
      } reader[6];
      uint32_t padding5[2]; /* Mirrors of reader[4], reader[5] */

      uint32_t unk3;
      uint32_t padding6[7]; /* Mirrors of unk3 */

      uint32_t reader_second_block[6];
      uint32_t padding7[2];

      uint32_t block_count[6];
      uint32_t padding8[2];

      uint32_t unk4;
      uint32_t padding9[7]; /* Mirrors of unk4 */
   } data1;
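
   /* The layout above is 0x50 dwords in total: each group of fields is
    * padded out to an 8-dword boundary.
    */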

   const uint32_t *data1_ptr =
      small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
   memcpy(&data1, data1_ptr, sizeof(data1));

   /* Based on the kernel, the first dword is the mem pool size (in
    * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
    */
   const uint32_t *data2_ptr =
      small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
   const int data2_size = 0x60;

   /* This seems to be the size of each queue in chunks. */
   const uint32_t *queue_sizes = &data2_ptr[0x18];

   printf("\tdata2:\n");
   dump_hex_ascii(data2_ptr, 4 * data2_size, 1);

   /* These seem to be some kind of counter of allocated/deallocated blocks */
   if (verbose) {
      printf("\tunk0: %x\n", data1.unk0);
      printf("\tunk1: %x\n", data1.unk1);
      printf("\tunk3: %x\n", data1.unk3);
      printf("\tunk4: %x\n\n", data1.unk4);
   }

   for (int queue = 0; queue < num_queues; queue++) {
      const char *cluster_names[6] = {"FE",   "SP_VS", "PC_VS",
                                      "GRAS", "SP_PS", "PS"};
      printf("\tCLUSTER_%s:\n\n", cluster_names[queue]);

      if (verbose) {
         printf("\t\twriter_first_block: 0x%x\n",
                data1.writer[queue].first_block);
         printf("\t\twriter_second_block: 0x%x\n",
                data1.writer_second_block[queue]);
         printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
         printf("\t\treader_first_block: 0x%x\n",
                data1.reader[queue].first_block);
         printf("\t\treader_second_block: 0x%x\n",
                data1.reader_second_block[queue]);
         printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
         printf("\t\tblock_count: %d\n", data1.block_count[queue]);
         printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
         printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
      }

      uint32_t cur_chunk = data1.reader[queue].chunk;
      uint32_t cur_block = cur_chunk > 3 ? data1.reader[queue].first_block
                                         : data1.reader_second_block[queue];
      uint32_t last_chunk = data1.writer[queue].chunk;
      uint32_t last_block = last_chunk > 3 ? data1.writer[queue].first_block
                                           : data1.writer_second_block[queue];

      if (verbose)
         printf("\tblock %x\n", cur_block);
      if (cur_block >= num_blocks) {
         fprintf(stderr, "block %x too large\n", cur_block);
         exit(1);
      }
      unsigned calculated_queue_size = 0;
      while (cur_block != last_block || cur_chunk != last_chunk) {
         calculated_queue_size++;
         uint32_t *chunk_ptr = &mempool[cur_block * 0x20 + cur_chunk * 4];

         dump_mem_pool_chunk(chunk_ptr);

         printf("\t%05x: %08x %08x %08x %08x\n",
                4 * (cur_block * 0x20 + cur_chunk * 4), chunk_ptr[0],
                chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);

         cur_chunk++;
         if (cur_chunk == 8) {
            cur_block = next_pointers[cur_block];
            if (verbose)
               printf("\tblock %x\n", cur_block);
            if (cur_block >= num_blocks) {
               fprintf(stderr, "block %x too large\n", cur_block);
               exit(1);
            }
            cur_chunk = 0;
         }
      }
      if (calculated_queue_size != queue_sizes[queue]) {
         printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n",
                calculated_queue_size);
      }
      printf("\n");
   }
}