/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "crashdec.h"

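/* Decode and print a single register write pulled out of a mem pool chunk.
 * Pipe register writes are decoded via the rnn pipe register database,
 * while context register writes go through regname()/dump_register_val().
 */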
static void
dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context,
                        bool pipe)
{
   /* TODO deal better somehow w/ 64b regs: */
   struct regacc r = {
      .rnn = pipe ? rnn_pipe : NULL,
      .regbase = reg,
      .value = data,
   };
   if (pipe) {
      struct rnndecaddrinfo *info = rnn_reginfo(rnn_pipe, reg);
      printf("\t\twrite %s (%02x) pipe\n", info->name, reg);

      if (!strcmp(info->typeinfo->name, "void")) {
         /* registers that ignore their payload */
      } else {
         printf("\t\t\t");
         dump_register(&r);
      }
      rnn_reginfo_free(info);
   } else {
      printf("\t\twrite %s (%05x) context %d\n", regname(reg, 1), reg, context);
      dump_register_val(&r, 2);
   }
}

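/* Decode one 128-bit mem pool chunk, which encodes up to two register
 * writes (see the bitfield layout below).
 */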
static void
dump_mem_pool_chunk(const uint32_t *chunk)
{
   struct __attribute__((packed)) {
      bool reg0_enabled : 1;
      bool reg1_enabled : 1;
      uint32_t data0 : 32;
      uint32_t data1 : 32;
      uint32_t reg0 : 18;
      uint32_t reg1 : 18;
      bool reg0_pipe : 1;
      bool reg1_pipe : 1;
      uint32_t reg0_context : 1;
      uint32_t reg1_context : 1;
      uint32_t padding : 22;
   } fields;

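   /* A chunk is 128 bits, i.e. 4 dwords, so copy it into the packed
    * struct to unpack the fields. */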
   memcpy(&fields, chunk, 4 * sizeof(uint32_t));

   if (fields.reg0_enabled) {
      dump_mem_pool_reg_write(fields.reg0, fields.data0, fields.reg0_context,
                              fields.reg0_pipe);
   }

   if (fields.reg1_enabled) {
      dump_mem_pool_reg_write(fields.reg1, fields.data1, fields.reg1_context,
                              fields.reg1_pipe);
   }
}

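/* Dump the CP mem pool section of a crash dump, walking each cluster's
 * queue of in-flight register writes.
 */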
void
dump_cp_mem_pool(uint32_t *mempool)
{
   /* The mem pool is a shared pool of memory used for storing in-flight
    * register writes. There are 6 different queues, one for each
    * cluster. Writing to $data (or, for some special registers, $addr)
    * pushes data onto the appropriate queue, and each queue is pulled
    * from by the appropriate cluster. The queues are thus written to
    * in-order, but may be read out-of-order.
    *
    * The queues are conceptually divided into 128-bit "chunks", and the
    * read and write pointers are in units of chunks. These chunks are
    * organized internally into 8-chunk "blocks", and memory is allocated
    * dynamically in terms of blocks. Each queue is represented as a
    * singly-linked list of blocks, as well as 3-bit start/end chunk
    * pointers that point within the first/last block. The next pointers
    * are located in a separate array, rather than inline.
    */

   /* TODO: The firmware CP_MEM_POOL save/restore routines do something
    * like:
    *
    * cread $02, [ $00 + 0 ]
    * and $02, $02, 0x118
    * ...
    * brne $02, 0, #label
    * mov $03, 0x2000
    * mov $03, 0x1000
    * label:
    * ...
    *
    * I think that control register 0 is the GPU version, and some
    * versions have a smaller mem pool. It seems some models have a mem
    * pool that's half the size, and a bunch of offsets are shifted
    * accordingly. Unfortunately the kernel driver's dumping code doesn't
    * seem to take this into account, not even the downstream Android
    * driver, and we don't know which versions 0x8, 0x10, or 0x100
    * correspond to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure
    * this out?
    */
   bool small_mem_pool = false;

   /* The array of next pointers for each block. */
   const uint32_t *next_pointers =
      small_mem_pool ? &mempool[0x800] : &mempool[0x1000];

   /* Maximum number of blocks in the pool, also the size of the pointers
    * array.
    */
   const int num_blocks = small_mem_pool ? 0x30 : 0x80;

   /* Number of queues */
   const unsigned num_queues = 6;

   /* Unfortunately the per-queue state is a little more complicated than
    * a simple pair of begin/end pointers. Instead of a single beginning
    * block, there are *two*, with the property that either the two are
    * equal or the second is the "next" of the first. Similarly there are
    * two end blocks. Thus the queue either looks like this:
    *
    * A -> B -> ... -> C -> D
    *
    * Or like this, or some combination:
    *
    * A/B -> ... -> C/D
    *
    * However, there's only one beginning/end chunk offset. Now the
    * question is, which of A or B is the actual start? I.e. is the chunk
    * offset an offset inside A or B? It depends. I'll show a typical read
    * cycle, starting here (read pointer marked with a *) with a chunk
    * offset of 0:
    *
    *         A                    B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
    *
    * Once the pointer advances far enough, the hardware decides to free
    * A, after which the read-side state looks like:
    *
    *      (free)               A/B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_|    |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
    *
    * Then after advancing the pointer a bit more, the hardware fetches
    * the "next" pointer for A and stores it in B:
    *
    *      (free)                A                    B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
    *
    * Then the read pointer advances into B, at which point we've come
    * back to the first state having advanced a whole block:
    *
    *      (free)                A                    B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
    *
    * There is a similar cycle for the write pointer. Now, the question
    * is, how do we know which state we're in? We need to know this to
    * know whether the pointer (*) is in A or B if they're different. It
    * seems like there should be some bit somewhere describing this, but
    * after lots of experimentation I've come up empty-handed. For now we
    * assume that if the pointer is in the first half, then we're in
    * either the first or second state and use B, and otherwise we're in
    * the second or third state and use A. So far I haven't seen anything
    * that violates this assumption.
    */

   struct {
      uint32_t unk0;
      uint32_t padding0[7]; /* Mirrors of unk0 */

      struct {
         uint32_t chunk : 3;
         uint32_t first_block : 32 - 3;
      } writer[6];
      uint32_t padding1[2]; /* Mirrors of writer[4], writer[5] */

      uint32_t unk1;
      uint32_t padding2[7]; /* Mirrors of unk1 */

      uint32_t writer_second_block[6];
      uint32_t padding3[2];

      uint32_t unk2[6];
      uint32_t padding4[2];

      struct {
         uint32_t chunk : 3;
         uint32_t first_block : 32 - 3;
      } reader[6];
      uint32_t padding5[2]; /* Mirrors of reader[4], reader[5] */

      uint32_t unk3;
      uint32_t padding6[7]; /* Mirrors of unk3 */

      uint32_t reader_second_block[6];
      uint32_t padding7[2];

      uint32_t block_count[6];
      uint32_t padding8[2];

      uint32_t unk4;
      uint32_t padding9[7]; /* Mirrors of unk4 */
   } data1;

   const uint32_t *data1_ptr =
      small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
   memcpy(&data1, data1_ptr, sizeof(data1));

   /* Based on the kernel, the first dword is the mem pool size (in
    * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
    */
   const uint32_t *data2_ptr =
      small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
   const int data2_size = 0x60;

   /* This seems to be the size of each queue in chunks. */
   const uint32_t *queue_sizes = &data2_ptr[0x18];

   printf("\tdata2:\n");
   dump_hex_ascii(data2_ptr, 4 * data2_size, 1);

   /* These seem to be some kind of counters of allocated/deallocated blocks */
   if (verbose) {
      printf("\tunk0: %x\n", data1.unk0);
      printf("\tunk1: %x\n", data1.unk1);
      printf("\tunk3: %x\n", data1.unk3);
      printf("\tunk4: %x\n\n", data1.unk4);
   }

   for (int queue = 0; queue < num_queues; queue++) {
      const char *cluster_names[6] = {"FE",   "SP_VS", "PC_VS",
                                      "GRAS", "SP_PS", "PS"};
      printf("\tCLUSTER_%s:\n\n", cluster_names[queue]);

      if (verbose) {
         printf("\t\twriter_first_block: 0x%x\n",
                data1.writer[queue].first_block);
         printf("\t\twriter_second_block: 0x%x\n",
                data1.writer_second_block[queue]);
         printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
         printf("\t\treader_first_block: 0x%x\n",
                data1.reader[queue].first_block);
         printf("\t\treader_second_block: 0x%x\n",
                data1.reader_second_block[queue]);
         printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
         printf("\t\tblock_count: %d\n", data1.block_count[queue]);
         printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
         printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
      }

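      /* Apply the heuristic described in the comment above: a chunk
       * offset in the first half of the block means the pointer is in
       * the second block (B), otherwise it is in the first block (A).
       */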
      uint32_t cur_chunk = data1.reader[queue].chunk;
      uint32_t cur_block = cur_chunk > 3 ? data1.reader[queue].first_block
                                         : data1.reader_second_block[queue];
      uint32_t last_chunk = data1.writer[queue].chunk;
      uint32_t last_block = last_chunk > 3 ? data1.writer[queue].first_block
                                           : data1.writer_second_block[queue];

      if (verbose)
         printf("\tblock %x\n", cur_block);
      if (cur_block >= num_blocks) {
         fprintf(stderr, "block %x too large\n", cur_block);
         exit(1);
      }
      unsigned calculated_queue_size = 0;
      while (cur_block != last_block || cur_chunk != last_chunk) {
         calculated_queue_size++;
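         /* Blocks are 8 chunks of 4 dwords each, so a block occupies
          * 0x20 dwords in the dump. */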
         uint32_t *chunk_ptr = &mempool[cur_block * 0x20 + cur_chunk * 4];

         dump_mem_pool_chunk(chunk_ptr);

         printf("\t%05x: %08x %08x %08x %08x\n",
                4 * (cur_block * 0x20 + cur_chunk * 4), chunk_ptr[0],
                chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);

         cur_chunk++;
         if (cur_chunk == 8) {
            cur_block = next_pointers[cur_block];
            if (verbose)
               printf("\tblock %x\n", cur_block);
            if (cur_block >= num_blocks) {
               fprintf(stderr, "block %x too large\n", cur_block);
               exit(1);
            }
            cur_chunk = 0;
         }
      }
      if (calculated_queue_size != queue_sizes[queue]) {
         printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n",
                calculated_queue_size);
      }
      printf("\n");
   }
}