1 /*
2 * Copyright © 2022 Igalia S.L.
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "tu_cs.h"
7
8 #include <arpa/inet.h>
9 #include <netinet/in.h>
10 #include <sys/socket.h>
11
12 #include "tu_device.h"
13
14 /* A simple implementation of breadcrumbs tracking of GPU progress
15 * intended to be a last resort when debugging unrecoverable hangs.
16 * For best results use Vulkan traces to have a predictable place of hang.
17 *
18 * For ordinary hangs as a more user-friendly solution use GFR
19 * "Graphics Flight Recorder".
20 *
21 * This implementation aims to handle cases where we cannot do anything
22 * after the hang, which is achieved by:
23 * - On GPU after each breadcrumb we wait until CPU acks it and sends udp
24 * packet to the remote host;
25 * - At specified breadcrumb require explicit user input to continue
26 * execution up to the next breadcrumb.
27 *
28 * In-driver breadcrumbs also allow more precise tracking since we could
29 * target a single GPU packet.
30 *
31 *
32 * Breadcrumbs settings:
33 *
34 * TU_BREADCRUMBS=$IP:$PORT,break=$BREAKPOINT:$BREAKPOINT_HITS
35 * Where:
36 * $BREAKPOINT - the breadcrumb from which we require explicit ack
37 * $BREAKPOINT_HITS - how many times breakpoint should be reached for
38 * break to occur. Necessary for a gmem mode and re-usable cmdbuffers
39 * in both of which the same cmdstream could be executed several times.
40 *
41 *
42 * A typical work flow would be:
43 * - Start listening for breadcrumbs on remote host:
44 * nc -lvup $PORT | stdbuf -o0 xxd -pc -c 4 | awk -Wposix '{printf("%u:%u\n", "0x" $0, a[$0]++)}'
45 *
46 * - Start capturing command stream:
47 * sudo cat /sys/kernel/debug/dri/0/rd > ~/cmdstream.rd
48 *
49 * - On device replay the hanging trace with:
50 * TU_BREADCRUMBS=$IP:$PORT,break=-1:0
51 * ! Try to reproduce the hang in a sysmem mode because it would
52 * require much less breadcrumb writes and syncs.
53 *
54 * - Increase hangcheck period:
55 * echo -n 60000 > /sys/kernel/debug/dri/0/hangcheck_period_ms
56 *
57 * - After GPU hang note the last breadcrumb and relaunch trace with:
58 * TU_BREADCRUMBS=$IP:$PORT,break=$LAST_BREADCRUMB:$HITS
59 *
60 * - After the breakpoint is reached each breadcrumb would require
61 * explicit ack from the user. This way it's possible to find
62 * the last packet which didn't hang.
63 *
64 * - Find the packet in the decoded cmdstream.
65 */
66
/* Per-device breadcrumbs state. It is allocated by tu_breadcrumbs_init()
 * when TU_BREADCRUMBS is set and is shared between the command stream
 * emission code and the CPU-side acknowledgement thread.
 */
struct breadcrumbs_context
{
   /* Destination of the UDP breadcrumb packets ($IP and $PORT). */
   char remote_host[64];
   int remote_port;

   /* First breadcrumb that requires an explicit ack from the user, and how
    * many times that breadcrumb has to be seen before prompting starts.
    */
   uint32_t breadcrumb_breakpoint;
   uint32_t breadcrumb_breakpoint_hits;

   /* Set by tu_breadcrumbs_finish() to ask the thread below to exit. */
   bool thread_stop;
   pthread_t breadcrumbs_thread;

   /* Device whose global BO holds the GPU/CPU sync seqnos. */
   struct tu_device *device;

   /* Counter handing out breadcrumb values; incremented atomically for every
    * breadcrumb considered by tu_cs_emit_sync_breadcrumb().
    */
   uint32_t breadcrumb_idx;
};
81
82 static void *
sync_gpu_with_cpu(void * _job)83 sync_gpu_with_cpu(void *_job)
84 {
85 struct breadcrumbs_context *ctx = (struct breadcrumbs_context *) _job;
86 struct tu6_global *global =
87 (struct tu6_global *) ctx->device->global_bo->map;
88 uint32_t last_breadcrumb = 0;
89 uint32_t breakpoint_hits = 0;
90
91 int s = socket(AF_INET, SOCK_DGRAM, 0);
92
93 if (s < 0) {
94 mesa_loge("TU_BREADCRUMBS: Error while creating socket");
95 return NULL;
96 }
97
98 struct sockaddr_in to_addr;
99 to_addr.sin_family = AF_INET;
100 to_addr.sin_port = htons(ctx->remote_port);
101 to_addr.sin_addr.s_addr = inet_addr(ctx->remote_host);
102
103 /* Run until we know that no more work would be submitted,
104 * because each breadcrumb requires an ack from cpu side and without
105 * the ack GPU would timeout.
106 */
107 while (!ctx->thread_stop) {
108 uint32_t current_breadcrumb = global->breadcrumb_gpu_sync_seqno;
109
110 if (current_breadcrumb != last_breadcrumb) {
111 last_breadcrumb = current_breadcrumb;
112
113 uint32_t data = htonl(last_breadcrumb);
114 if (sendto(s, &data, sizeof(data), 0, (struct sockaddr *) &to_addr,
115 sizeof(to_addr)) < 0) {
116 mesa_loge("TU_BREADCRUMBS: sendto failed");
117 goto fail;
118 }
119
120 if (last_breadcrumb >= ctx->breadcrumb_breakpoint &&
121 breakpoint_hits >= ctx->breadcrumb_breakpoint_hits) {
122 printf("GPU is on breadcrumb %d, continue?", last_breadcrumb);
123 while (getchar() != 'y')
124 ;
125 }
126
127 if (ctx->breadcrumb_breakpoint == last_breadcrumb)
128 breakpoint_hits++;
129
130 /* ack that we received the value */
131 global->breadcrumb_cpu_sync_seqno = last_breadcrumb;
132 }
133 }
134
135 fail:
136 close(s);
137
138 return NULL;
139 }
140
/* Counterpart of tu_cs_emit_pkt7 that skips the breadcrumb instrumentation:
 * reserves room for the header plus cnt payload dwords, then writes the
 * type-7 packet header for opcode.
 */
static inline void
emit_pkt7(struct tu_cs *cs, uint8_t opcode, uint16_t cnt)
{
   /* One extra dword for the header itself. */
   const uint32_t dwords_needed = cnt + 1;

   tu_cs_reserve(cs, dwords_needed);
   tu_cs_emit(cs, pm4_pkt7_hdr(opcode, cnt));
}
148
149 void
tu_breadcrumbs_init(struct tu_device * device)150 tu_breadcrumbs_init(struct tu_device *device)
151 {
152 const char *breadcrumbs_opt = NULL;
153 #ifdef TU_BREADCRUMBS_ENABLED
154 breadcrumbs_opt = os_get_option("TU_BREADCRUMBS");
155 #endif
156
157 device->breadcrumbs_ctx = NULL;
158 if (!breadcrumbs_opt) {
159 return;
160 }
161
162 struct breadcrumbs_context *ctx =
163 malloc(sizeof(struct breadcrumbs_context));
164 ctx->device = device;
165 ctx->breadcrumb_idx = 0;
166 ctx->thread_stop = false;
167
168 if (sscanf(breadcrumbs_opt, "%[^:]:%d,break=%u:%u", ctx->remote_host,
169 &ctx->remote_port, &ctx->breadcrumb_breakpoint,
170 &ctx->breadcrumb_breakpoint_hits) != 4) {
171 free(ctx);
172 mesa_loge("Wrong TU_BREADCRUMBS value");
173 return;
174 }
175
176 device->breadcrumbs_ctx = ctx;
177
178 struct tu6_global *global = device->global_bo->map;
179 global->breadcrumb_cpu_sync_seqno = 0;
180 global->breadcrumb_gpu_sync_seqno = 0;
181
182 pthread_create(&ctx->breadcrumbs_thread, NULL, sync_gpu_with_cpu, ctx);
183 }
184
185 void
tu_breadcrumbs_finish(struct tu_device * device)186 tu_breadcrumbs_finish(struct tu_device *device)
187 {
188 struct breadcrumbs_context *ctx = device->breadcrumbs_ctx;
189 if (!ctx || ctx->thread_stop)
190 return;
191
192 ctx->thread_stop = true;
193 pthread_join(ctx->breadcrumbs_thread, NULL);
194
195 free(ctx);
196 }
197
198 void
tu_cs_emit_sync_breadcrumb(struct tu_cs * cs,uint8_t opcode,uint16_t cnt)199 tu_cs_emit_sync_breadcrumb(struct tu_cs *cs, uint8_t opcode, uint16_t cnt)
200 {
201 /* TODO: we may run out of space if we add breadcrumbs
202 * to non-growable CS.
203 */
204 if (cs->mode != TU_CS_MODE_GROW)
205 return;
206
207 struct tu_device *device = cs->device;
208 struct breadcrumbs_context *ctx = device->breadcrumbs_ctx;
209 if (!ctx || ctx->thread_stop)
210 return;
211
212 bool before_packet = (cnt != 0);
213
214 if (before_packet) {
215 switch (opcode) {
216 case CP_EXEC_CS_INDIRECT:
217 case CP_EXEC_CS:
218 case CP_DRAW_INDX:
219 case CP_DRAW_INDX_OFFSET:
220 case CP_DRAW_INDIRECT:
221 case CP_DRAW_INDX_INDIRECT:
222 case CP_DRAW_INDIRECT_MULTI:
223 case CP_DRAW_AUTO:
224 case CP_BLIT:
225 // case CP_SET_DRAW_STATE:
226 // case CP_LOAD_STATE6_FRAG:
227 // case CP_LOAD_STATE6_GEOM:
228 break;
229 default:
230 return;
231 };
232 } else {
233 assert(cs->breadcrumb_emit_after == 0);
234 }
235
236 uint32_t current_breadcrumb = p_atomic_inc_return(&ctx->breadcrumb_idx);
237
238 if (ctx->breadcrumb_breakpoint != -1 &&
239 current_breadcrumb < ctx->breadcrumb_breakpoint)
240 return;
241
242 emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
243 emit_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
244 emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
245
246 emit_pkt7(cs, CP_MEM_WRITE, 3);
247 tu_cs_emit_qw(
248 cs, device->global_bo->iova + gb_offset(breadcrumb_gpu_sync_seqno));
249 tu_cs_emit(cs, current_breadcrumb);
250
251 /* Wait until CPU acknowledges the value written by GPU */
252 emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
253 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
254 CP_WAIT_REG_MEM_0_POLL_MEMORY);
255 tu_cs_emit_qw(
256 cs, device->global_bo->iova + gb_offset(breadcrumb_cpu_sync_seqno));
257 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(current_breadcrumb));
258 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
259 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
260
261 if (before_packet)
262 cs->breadcrumb_emit_after = cnt;
263 }
264