• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
23 
24 
25 #include "crashdec.h"
26 
27 
28 static void
dump_mem_pool_reg_write(unsigned reg,uint32_t data,unsigned context,bool pipe)29 dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context,
30                         bool pipe)
31 {
32    if (pipe) {
33       struct rnndecaddrinfo *info = rnn_reginfo(rnn_pipe, reg);
34       printf("\t\twrite %s (%02x) pipe\n", info->name, reg);
35 
36       if (!strcmp(info->typeinfo->name, "void")) {
37          /* registers that ignore their payload */
38       } else {
39          printf("\t\t\t");
40          dump_register(rnn_pipe, reg, data);
41       }
42    } else {
43       printf("\t\twrite %s (%05x) context %d\n", regname(reg, 1), reg, context);
44       dump_register_val(reg, data, 2);
45    }
46 }
47 
static void
dump_mem_pool_chunk(const uint32_t *chunk)
{
   /* Decode one 128-bit mem-pool chunk.  A chunk encodes up to two queued
    * register writes; the bit-field struct below spells out that encoding.
    * NOTE(review): bit-field packing/order is implementation-defined in C;
    * this presumably relies on the GCC/Clang little-endian layout matching
    * the hardware's chunk format -- confirm before porting to another
    * compiler/ABI.
    */
   struct __attribute__((packed)) {
      bool reg0_enabled : 1;      /* chunk contains write 0 */
      bool reg1_enabled : 1;      /* chunk contains write 1 */
      uint32_t data0 : 32;        /* payload for write 0 */
      uint32_t data1 : 32;        /* payload for write 1 */
      uint32_t reg0 : 18;         /* register offset for write 0 */
      uint32_t reg1 : 18;         /* register offset for write 1 */
      bool reg0_pipe : 1;         /* write 0 targets a pipe register */
      bool reg1_pipe : 1;         /* write 1 targets a pipe register */
      uint32_t reg0_context : 1;  /* context bank for write 0 */
      uint32_t reg1_context : 1;  /* context bank for write 1 */
      uint32_t padding : 22;      /* pads the struct out to 128 bits */
   } fields;

   /* Copy the chunk's 4 dwords (128 bits) into the decoded view. */
   memcpy(&fields, chunk, 4 * sizeof(uint32_t));

   if (fields.reg0_enabled) {
      dump_mem_pool_reg_write(fields.reg0, fields.data0, fields.reg0_context,
                              fields.reg0_pipe);
   }

   if (fields.reg1_enabled) {
      dump_mem_pool_reg_write(fields.reg1, fields.data1, fields.reg1_context,
                              fields.reg1_pipe);
   }
}
77 
78 void
dump_cp_mem_pool(uint32_t * mempool)79 dump_cp_mem_pool(uint32_t *mempool)
80 {
81    /* The mem pool is a shared pool of memory used for storing in-flight
82     * register writes. There are 6 different queues, one for each
83     * cluster. Writing to $data (or for some special registers, $addr)
84     * pushes data onto the appropriate queue, and each queue is pulled
85     * from by the appropriate cluster. The queues are thus written to
86     * in-order, but may be read out-of-order.
87     *
88     * The queues are conceptually divided into 128-bit "chunks", and the
89     * read and write pointers are in units of chunks.  These chunks are
90     * organized internally into 8-chunk "blocks", and memory is allocated
91     * dynamically in terms of blocks. Each queue is represented as a
92     * singly-linked list of blocks, as well as 3-bit start/end chunk
93     * pointers that point within the first/last block.  The next pointers
94     * are located in a separate array, rather than inline.
95     */
96 
97    /* TODO: The firmware CP_MEM_POOL save/restore routines do something
98     * like:
99     *
100     * cread $02, [ $00 + 0 ]
101     * and $02, $02, 0x118
102     * ...
103     * brne $02, 0, #label
104     * mov $03, 0x2000
105     * mov $03, 0x1000
106     * label:
107     * ...
108     *
109     * I think that control register 0 is the GPU version, and some
110     * versions have a smaller mem pool. It seems some models have a mem
111     * pool that's half the size, and a bunch of offsets are shifted
112     * accordingly. Unfortunately the kernel driver's dumping code doesn't
113     * seem to take this into account, even the downstream android driver,
114     * and we don't know which versions 0x8, 0x10, or 0x100 correspond
115     * to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure this out?
116     */
117    bool small_mem_pool = false;
118 
119    /* The array of next pointers for each block. */
120    const uint32_t *next_pointers =
121       small_mem_pool ? &mempool[0x800] : &mempool[0x1000];
122 
123    /* Maximum number of blocks in the pool, also the size of the pointers
124     * array.
125     */
126    const int num_blocks = small_mem_pool ? 0x30 : 0x80;
127 
128    /* Number of queues */
129    const unsigned num_queues = 6;
130 
131    /* Unfortunately the per-queue state is a little more complicated than
132     * a simple pair of begin/end pointers. Instead of a single beginning
133     * block, there are *two*, with the property that either the two are
134     * equal or the second is the "next" of the first. Similarly there are
135     * two end blocks. Thus the queue either looks like this:
136     *
137     * A -> B -> ... -> C -> D
138     *
139     * Or like this, or some combination:
140     *
141     * A/B -> ... -> C/D
142     *
143     * However, there's only one beginning/end chunk offset. Now the
144     * question is, which of A or B is the actual start? I.e. is the chunk
145     * offset an offset inside A or B? It depends. I'll show a typical read
146     * cycle, starting here (read pointer marked with a *) with a chunk
147     * offset of 0:
148     *
149     *	  A                    B
150     *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
151     * |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
152     *
153     * Once the pointer advances far enough, the hardware decides to free
154     * A, after which the read-side state looks like:
155     *
156     *	(free)                A/B
157     *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
158     * |_|_|_|_|_|_|_|_|    |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
159     *
160     * Then after advancing the pointer a bit more, the hardware fetches
161     * the "next" pointer for A and stores it in B:
162     *
163     *	(free)                 A                     B
164     *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
165     * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
166     *
167     * Then the read pointer advances into B, at which point we've come
168     * back to the first state having advanced a whole block:
169     *
170     *	(free)                 A                     B
171     *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
172     * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
173     *
174     *
175     * There is a similar cycle for the write pointer. Now, the question
176     * is, how do we know which state we're in? We need to know this to
177     * know whether the pointer (*) is in A or B if they're different. It
178     * seems like there should be some bit somewhere describing this, but
179     * after lots of experimentation I've come up empty-handed. For now we
180     * assume that if the pointer is in the first half, then we're in
181     * either the first or second state and use B, and otherwise we're in
182     * the second or third state and use A. So far I haven't seen anything
183     * that violates this assumption.
184     */
185 
186    struct {
187       uint32_t unk0;
188       uint32_t padding0[7]; /* Mirrors of unk0 */
189 
190       struct {
191          uint32_t chunk : 3;
192          uint32_t first_block : 32 - 3;
193       } writer[6];
194       uint32_t padding1[2]; /* Mirrors of writer[4], writer[5] */
195 
196       uint32_t unk1;
197       uint32_t padding2[7]; /* Mirrors of unk1 */
198 
199       uint32_t writer_second_block[6];
200       uint32_t padding3[2];
201 
202       uint32_t unk2[6];
203       uint32_t padding4[2];
204 
205       struct {
206          uint32_t chunk : 3;
207          uint32_t first_block : 32 - 3;
208       } reader[6];
209       uint32_t padding5[2]; /* Mirrors of reader[4], reader[5] */
210 
211       uint32_t unk3;
212       uint32_t padding6[7]; /* Mirrors of unk3 */
213 
214       uint32_t reader_second_block[6];
215       uint32_t padding7[2];
216 
217       uint32_t block_count[6];
218       uint32_t padding[2];
219 
220       uint32_t unk4;
221       uint32_t padding9[7]; /* Mirrors of unk4 */
222    } data1;
223 
224    const uint32_t *data1_ptr =
225       small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
226    memcpy(&data1, data1_ptr, sizeof(data1));
227 
228    /* Based on the kernel, the first dword is the mem pool size (in
229     * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
230     */
231    const uint32_t *data2_ptr =
232       small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
233    const int data2_size = 0x60;
234 
235    /* This seems to be the size of each queue in chunks. */
236    const uint32_t *queue_sizes = &data2_ptr[0x18];
237 
238    printf("\tdata2:\n");
239    dump_hex_ascii(data2_ptr, 4 * data2_size, 1);
240 
241    /* These seem to be some kind of counter of allocated/deallocated blocks */
242    if (verbose) {
243       printf("\tunk0: %x\n", data1.unk0);
244       printf("\tunk1: %x\n", data1.unk1);
245       printf("\tunk3: %x\n", data1.unk3);
246       printf("\tunk4: %x\n\n", data1.unk4);
247    }
248 
249    for (int queue = 0; queue < num_queues; queue++) {
250       const char *cluster_names[6] = {"FE",   "SP_VS", "PC_VS",
251                                       "GRAS", "SP_PS", "PS"};
252       printf("\tCLUSTER_%s:\n\n", cluster_names[queue]);
253 
254       if (verbose) {
255          printf("\t\twriter_first_block: 0x%x\n",
256                 data1.writer[queue].first_block);
257          printf("\t\twriter_second_block: 0x%x\n",
258                 data1.writer_second_block[queue]);
259          printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
260          printf("\t\treader_first_block: 0x%x\n",
261                 data1.reader[queue].first_block);
262          printf("\t\treader_second_block: 0x%x\n",
263                 data1.reader_second_block[queue]);
264          printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
265          printf("\t\tblock_count: %d\n", data1.block_count[queue]);
266          printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
267          printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
268       }
269 
270       uint32_t cur_chunk = data1.reader[queue].chunk;
271       uint32_t cur_block = cur_chunk > 3 ? data1.reader[queue].first_block
272                                          : data1.reader_second_block[queue];
273       uint32_t last_chunk = data1.writer[queue].chunk;
274       uint32_t last_block = last_chunk > 3 ? data1.writer[queue].first_block
275                                            : data1.writer_second_block[queue];
276 
277       if (verbose)
278          printf("\tblock %x\n", cur_block);
279       if (cur_block >= num_blocks) {
280          fprintf(stderr, "block %x too large\n", cur_block);
281          exit(1);
282       }
283       unsigned calculated_queue_size = 0;
284       while (cur_block != last_block || cur_chunk != last_chunk) {
285          calculated_queue_size++;
286          uint32_t *chunk_ptr = &mempool[cur_block * 0x20 + cur_chunk * 4];
287 
288          dump_mem_pool_chunk(chunk_ptr);
289 
290          printf("\t%05x: %08x %08x %08x %08x\n",
291                 4 * (cur_block * 0x20 + cur_chunk + 4), chunk_ptr[0],
292                 chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);
293 
294          cur_chunk++;
295          if (cur_chunk == 8) {
296             cur_block = next_pointers[cur_block];
297             if (verbose)
298                printf("\tblock %x\n", cur_block);
299             if (cur_block >= num_blocks) {
300                fprintf(stderr, "block %x too large\n", cur_block);
301                exit(1);
302             }
303             cur_chunk = 0;
304          }
305       }
306       if (calculated_queue_size != queue_sizes[queue]) {
307          printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n",
308                 calculated_queue_size);
309       }
310       printf("\n");
311    }
312 }
313 
314