1 /*
2 * Copyright 2015 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "ac_debug.h"
8 #include "sid.h"
9 #include "sid_tables.h"
10
11 #include "util/u_string.h"
12
13 #include <inttypes.h>
14
ac_find_register(enum amd_gfx_level gfx_level,enum radeon_family family,unsigned offset)15 const struct si_reg *ac_find_register(enum amd_gfx_level gfx_level, enum radeon_family family,
16 unsigned offset)
17 {
18 const struct si_reg *table;
19 unsigned table_size;
20
21 switch (gfx_level) {
22 case GFX11_5:
23 table = gfx115_reg_table;
24 table_size = ARRAY_SIZE(gfx115_reg_table);
25 break;
26 case GFX11:
27 table = gfx11_reg_table;
28 table_size = ARRAY_SIZE(gfx11_reg_table);
29 break;
30 case GFX10_3:
31 table = gfx103_reg_table;
32 table_size = ARRAY_SIZE(gfx103_reg_table);
33 break;
34 case GFX10:
35 table = gfx10_reg_table;
36 table_size = ARRAY_SIZE(gfx10_reg_table);
37 break;
38 case GFX9:
39 if (family == CHIP_GFX940) {
40 table = gfx940_reg_table;
41 table_size = ARRAY_SIZE(gfx940_reg_table);
42 break;
43 }
44 table = gfx9_reg_table;
45 table_size = ARRAY_SIZE(gfx9_reg_table);
46 break;
47 case GFX8:
48 if (family == CHIP_STONEY) {
49 table = gfx81_reg_table;
50 table_size = ARRAY_SIZE(gfx81_reg_table);
51 break;
52 }
53 table = gfx8_reg_table;
54 table_size = ARRAY_SIZE(gfx8_reg_table);
55 break;
56 case GFX7:
57 table = gfx7_reg_table;
58 table_size = ARRAY_SIZE(gfx7_reg_table);
59 break;
60 case GFX6:
61 table = gfx6_reg_table;
62 table_size = ARRAY_SIZE(gfx6_reg_table);
63 break;
64 default:
65 return NULL;
66 }
67
68 for (unsigned i = 0; i < table_size; i++) {
69 const struct si_reg *reg = &table[i];
70
71 if (reg->offset == offset)
72 return reg;
73 }
74
75 return NULL;
76 }
77
ac_get_register_name(enum amd_gfx_level gfx_level,enum radeon_family family,unsigned offset)78 const char *ac_get_register_name(enum amd_gfx_level gfx_level, enum radeon_family family,
79 unsigned offset)
80 {
81 const struct si_reg *reg = ac_find_register(gfx_level, family, offset);
82
83 return reg ? sid_strings + reg->name_offset : "(no name)";
84 }
85
/**
 * Tell whether ac_find_register() finds an entry for the given offset on the
 * given gfx level / chip family.
 */
bool ac_register_exists(enum amd_gfx_level gfx_level, enum radeon_family family,
                        unsigned offset)
{
   const struct si_reg *match = ac_find_register(gfx_level, family, offset);

   return match != NULL;
}
91
92 /**
93 * Parse dmesg and return TRUE if a VM fault has been detected.
94 *
95 * \param gfx_level gfx level
96 * \param old_dmesg_timestamp previous dmesg timestamp parsed at init time
97 * \param out_addr detected VM fault addr
98 */
ac_vm_fault_occurred(enum amd_gfx_level gfx_level,uint64_t * old_dmesg_timestamp,uint64_t * out_addr)99 bool ac_vm_fault_occurred(enum amd_gfx_level gfx_level, uint64_t *old_dmesg_timestamp,
100 uint64_t *out_addr)
101 {
102 #ifdef _WIN32
103 return false;
104 #else
105 char line[2000];
106 unsigned sec, usec;
107 int progress = 0;
108 uint64_t dmesg_timestamp = 0;
109 bool fault = false;
110
111 FILE *p = popen("dmesg", "r");
112 if (!p)
113 return false;
114
115 while (fgets(line, sizeof(line), p)) {
116 char *msg, len;
117
118 if (!line[0] || line[0] == '\n')
119 continue;
120
121 /* Get the timestamp. */
122 if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) {
123 static bool hit = false;
124 if (!hit) {
125 fprintf(stderr, "%s: failed to parse line '%s'\n", __func__, line);
126 hit = true;
127 }
128 continue;
129 }
130 dmesg_timestamp = sec * 1000000ull + usec;
131
132 /* If just updating the timestamp. */
133 if (!out_addr)
134 continue;
135
136 /* Process messages only if the timestamp is newer. */
137 if (dmesg_timestamp <= *old_dmesg_timestamp)
138 continue;
139
140 /* Only process the first VM fault. */
141 if (fault)
142 continue;
143
144 /* Remove trailing \n */
145 len = strlen(line);
146 if (len && line[len - 1] == '\n')
147 line[len - 1] = 0;
148
149 /* Get the message part. */
150 msg = strchr(line, ']');
151 if (!msg)
152 continue;
153 msg++;
154
155 const char *header_line, *addr_line_prefix, *addr_line_format;
156
157 if (gfx_level >= GFX9) {
158 /* Match this:
159 * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0)
160 * ..: at page 0x0000000219f8f000 from 27
161 * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C
162 */
163 header_line = "VMC page fault";
164 addr_line_prefix = " at page";
165 addr_line_format = "%" PRIx64;
166 } else {
167 header_line = "GPU fault detected:";
168 addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR";
169 addr_line_format = "%" PRIX64;
170 }
171
172 switch (progress) {
173 case 0:
174 if (strstr(msg, header_line))
175 progress = 1;
176 break;
177 case 1:
178 msg = strstr(msg, addr_line_prefix);
179 if (msg) {
180 msg = strstr(msg, "0x");
181 if (msg) {
182 msg += 2;
183 if (sscanf(msg, addr_line_format, out_addr) == 1)
184 fault = true;
185 }
186 }
187 progress = 0;
188 break;
189 default:
190 progress = 0;
191 }
192 }
193 pclose(p);
194
195 if (dmesg_timestamp > *old_dmesg_timestamp)
196 *old_dmesg_timestamp = dmesg_timestamp;
197
198 return fault;
199 #endif
200 }
201
compare_wave(const void * p1,const void * p2)202 static int compare_wave(const void *p1, const void *p2)
203 {
204 struct ac_wave_info *w1 = (struct ac_wave_info *)p1;
205 struct ac_wave_info *w2 = (struct ac_wave_info *)p2;
206
207 /* Sort waves according to PC and then SE, SH, CU, etc. */
208 if (w1->pc < w2->pc)
209 return -1;
210 if (w1->pc > w2->pc)
211 return 1;
212 if (w1->se < w2->se)
213 return -1;
214 if (w1->se > w2->se)
215 return 1;
216 if (w1->sh < w2->sh)
217 return -1;
218 if (w1->sh > w2->sh)
219 return 1;
220 if (w1->cu < w2->cu)
221 return -1;
222 if (w1->cu > w2->cu)
223 return 1;
224 if (w1->simd < w2->simd)
225 return -1;
226 if (w1->simd > w2->simd)
227 return 1;
228 if (w1->wave < w2->wave)
229 return -1;
230 if (w1->wave > w2->wave)
231 return 1;
232
233 return 0;
234 }
235
236 /* Return wave information. "waves" should be a large enough array. */
ac_get_wave_info(enum amd_gfx_level gfx_level,const struct radeon_info * info,struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])237 unsigned ac_get_wave_info(enum amd_gfx_level gfx_level, const struct radeon_info *info,
238 struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])
239 {
240 #ifdef _WIN32
241 return 0;
242 #else
243 char line[2000], cmd[256];
244 unsigned num_waves = 0;
245
246 sprintf(cmd, "umr --by-pci %04x:%02x:%02x.%01x -O halt_waves -wa %s",
247 info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func,
248 gfx_level >= GFX10 ? "gfx_0.0.0" : "gfx");
249
250 FILE *p = popen(cmd, "r");
251 if (!p)
252 return 0;
253
254 if (!fgets(line, sizeof(line), p) || strncmp(line, "SE", 2) != 0) {
255 pclose(p);
256 return 0;
257 }
258
259 while (fgets(line, sizeof(line), p)) {
260 struct ac_wave_info *w;
261 uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
262
263 assert(num_waves < AC_MAX_WAVES_PER_CHIP);
264 w = &waves[num_waves];
265
266 if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x", &w->se, &w->sh, &w->cu, &w->simd,
267 &w->wave, &w->status, &pc_hi, &pc_lo, &w->inst_dw0, &w->inst_dw1, &exec_hi,
268 &exec_lo) == 12) {
269 w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
270 w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
271 w->matched = false;
272 num_waves++;
273 }
274 }
275
276 qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave);
277
278 pclose(p);
279 return num_waves;
280 #endif
281 }
282
/* List of GFXHUB clients from AMDGPU source code, indexed by client ID. */
static const char *const gfx10_gfxhub_client_ids[] = {
   [0] = "CB/DB",
   [1] = "Reserved",
   [2] = "GE1",
   [3] = "GE2",
   [4] = "CPF",
   [5] = "CPC",
   [6] = "CPG",
   [7] = "RLC",
   [8] = "TCP",
   [9] = "SQC (inst)",
   [10] = "SQC (data)",
   [11] = "SQG",
   [12] = "Reserved",
   [13] = "SDMA0",
   [14] = "SDMA1",
   [15] = "GCR",
   [16] = "SDMA2",
   [17] = "SDMA3",
};
304
305 static const char *
ac_get_gfx10_gfxhub_client(unsigned cid)306 ac_get_gfx10_gfxhub_client(unsigned cid)
307 {
308 if (cid >= ARRAY_SIZE(gfx10_gfxhub_client_ids))
309 return "UNKNOWN";
310 return gfx10_gfxhub_client_ids[cid];
311 }
312
ac_print_gpuvm_fault_status(FILE * output,enum amd_gfx_level gfx_level,uint32_t status)313 void ac_print_gpuvm_fault_status(FILE *output, enum amd_gfx_level gfx_level,
314 uint32_t status)
315 {
316 if (gfx_level >= GFX10) {
317 const uint8_t cid = G_00A130_CID(status);
318
319 fprintf(output, "GCVM_L2_PROTECTION_FAULT_STATUS: 0x%x\n", status);
320 fprintf(output, "\t CLIENT_ID: (%s) 0x%x\n", ac_get_gfx10_gfxhub_client(cid), cid);
321 fprintf(output, "\t MORE_FAULTS: %d\n", G_00A130_MORE_FAULTS(status));
322 fprintf(output, "\t WALKER_ERROR: %d\n", G_00A130_WALKER_ERROR(status));
323 fprintf(output, "\t PERMISSION_FAULTS: %d\n", G_00A130_PERMISSION_FAULTS(status));
324 fprintf(output, "\t MAPPING_ERROR: %d\n", G_00A130_MAPPING_ERROR(status));
325 fprintf(output, "\t RW: %d\n", G_00A130_RW(status));
326 } else {
327 fprintf(output, "VM_CONTEXT1_PROTECTION_FAULT_STATUS: 0x%x\n", status);
328 }
329 }
330