• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2015 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ac_debug.h"
8 #include "sid.h"
9 #include "sid_tables.h"
10 
11 #include "util/u_string.h"
12 
13 #include <inttypes.h>
14 
ac_find_register(enum amd_gfx_level gfx_level,enum radeon_family family,unsigned offset)15 const struct si_reg *ac_find_register(enum amd_gfx_level gfx_level, enum radeon_family family,
16                                       unsigned offset)
17 {
18    const struct si_reg *table;
19    unsigned table_size;
20 
21    switch (gfx_level) {
22    case GFX11_5:
23       table = gfx115_reg_table;
24       table_size = ARRAY_SIZE(gfx115_reg_table);
25       break;
26    case GFX11:
27       table = gfx11_reg_table;
28       table_size = ARRAY_SIZE(gfx11_reg_table);
29       break;
30    case GFX10_3:
31       table = gfx103_reg_table;
32       table_size = ARRAY_SIZE(gfx103_reg_table);
33       break;
34    case GFX10:
35       table = gfx10_reg_table;
36       table_size = ARRAY_SIZE(gfx10_reg_table);
37       break;
38    case GFX9:
39       if (family == CHIP_GFX940) {
40          table = gfx940_reg_table;
41          table_size = ARRAY_SIZE(gfx940_reg_table);
42          break;
43       }
44       table = gfx9_reg_table;
45       table_size = ARRAY_SIZE(gfx9_reg_table);
46       break;
47    case GFX8:
48       if (family == CHIP_STONEY) {
49          table = gfx81_reg_table;
50          table_size = ARRAY_SIZE(gfx81_reg_table);
51          break;
52       }
53       table = gfx8_reg_table;
54       table_size = ARRAY_SIZE(gfx8_reg_table);
55       break;
56    case GFX7:
57       table = gfx7_reg_table;
58       table_size = ARRAY_SIZE(gfx7_reg_table);
59       break;
60    case GFX6:
61       table = gfx6_reg_table;
62       table_size = ARRAY_SIZE(gfx6_reg_table);
63       break;
64    default:
65       return NULL;
66    }
67 
68    for (unsigned i = 0; i < table_size; i++) {
69       const struct si_reg *reg = &table[i];
70 
71       if (reg->offset == offset)
72          return reg;
73    }
74 
75    return NULL;
76 }
77 
ac_get_register_name(enum amd_gfx_level gfx_level,enum radeon_family family,unsigned offset)78 const char *ac_get_register_name(enum amd_gfx_level gfx_level, enum radeon_family family,
79                                  unsigned offset)
80 {
81    const struct si_reg *reg = ac_find_register(gfx_level, family, offset);
82 
83    return reg ? sid_strings + reg->name_offset : "(no name)";
84 }
85 
/**
 * Tell whether ac_find_register() finds a register at the given offset.
 *
 * \return true if a matching register entry exists, false otherwise
 */
bool ac_register_exists(enum amd_gfx_level gfx_level, enum radeon_family family,
                        unsigned offset)
{
   const struct si_reg *found = ac_find_register(gfx_level, family, offset);

   return found ? true : false;
}
91 
92 /**
93  * Parse dmesg and return TRUE if a VM fault has been detected.
94  *
95  * \param gfx_level		gfx level
96  * \param old_dmesg_timestamp	previous dmesg timestamp parsed at init time
97  * \param out_addr		detected VM fault addr
98  */
ac_vm_fault_occurred(enum amd_gfx_level gfx_level,uint64_t * old_dmesg_timestamp,uint64_t * out_addr)99 bool ac_vm_fault_occurred(enum amd_gfx_level gfx_level, uint64_t *old_dmesg_timestamp,
100                          uint64_t *out_addr)
101 {
102 #ifdef _WIN32
103    return false;
104 #else
105    char line[2000];
106    unsigned sec, usec;
107    int progress = 0;
108    uint64_t dmesg_timestamp = 0;
109    bool fault = false;
110 
111    FILE *p = popen("dmesg", "r");
112    if (!p)
113       return false;
114 
115    while (fgets(line, sizeof(line), p)) {
116       char *msg, len;
117 
118       if (!line[0] || line[0] == '\n')
119          continue;
120 
121       /* Get the timestamp. */
122       if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) {
123          static bool hit = false;
124          if (!hit) {
125             fprintf(stderr, "%s: failed to parse line '%s'\n", __func__, line);
126             hit = true;
127          }
128          continue;
129       }
130       dmesg_timestamp = sec * 1000000ull + usec;
131 
132       /* If just updating the timestamp. */
133       if (!out_addr)
134          continue;
135 
136       /* Process messages only if the timestamp is newer. */
137       if (dmesg_timestamp <= *old_dmesg_timestamp)
138          continue;
139 
140       /* Only process the first VM fault. */
141       if (fault)
142          continue;
143 
144       /* Remove trailing \n */
145       len = strlen(line);
146       if (len && line[len - 1] == '\n')
147          line[len - 1] = 0;
148 
149       /* Get the message part. */
150       msg = strchr(line, ']');
151       if (!msg)
152          continue;
153       msg++;
154 
155       const char *header_line, *addr_line_prefix, *addr_line_format;
156 
157       if (gfx_level >= GFX9) {
158          /* Match this:
159           * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0)
160           * ..:   at page 0x0000000219f8f000 from 27
161           * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C
162           */
163          header_line = "VMC page fault";
164          addr_line_prefix = "   at page";
165          addr_line_format = "%" PRIx64;
166       } else {
167          header_line = "GPU fault detected:";
168          addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR";
169          addr_line_format = "%" PRIX64;
170       }
171 
172       switch (progress) {
173       case 0:
174          if (strstr(msg, header_line))
175             progress = 1;
176          break;
177       case 1:
178          msg = strstr(msg, addr_line_prefix);
179          if (msg) {
180             msg = strstr(msg, "0x");
181             if (msg) {
182                msg += 2;
183                if (sscanf(msg, addr_line_format, out_addr) == 1)
184                   fault = true;
185             }
186          }
187          progress = 0;
188          break;
189       default:
190          progress = 0;
191       }
192    }
193    pclose(p);
194 
195    if (dmesg_timestamp > *old_dmesg_timestamp)
196       *old_dmesg_timestamp = dmesg_timestamp;
197 
198    return fault;
199 #endif
200 }
201 
compare_wave(const void * p1,const void * p2)202 static int compare_wave(const void *p1, const void *p2)
203 {
204    struct ac_wave_info *w1 = (struct ac_wave_info *)p1;
205    struct ac_wave_info *w2 = (struct ac_wave_info *)p2;
206 
207    /* Sort waves according to PC and then SE, SH, CU, etc. */
208    if (w1->pc < w2->pc)
209       return -1;
210    if (w1->pc > w2->pc)
211       return 1;
212    if (w1->se < w2->se)
213       return -1;
214    if (w1->se > w2->se)
215       return 1;
216    if (w1->sh < w2->sh)
217       return -1;
218    if (w1->sh > w2->sh)
219       return 1;
220    if (w1->cu < w2->cu)
221       return -1;
222    if (w1->cu > w2->cu)
223       return 1;
224    if (w1->simd < w2->simd)
225       return -1;
226    if (w1->simd > w2->simd)
227       return 1;
228    if (w1->wave < w2->wave)
229       return -1;
230    if (w1->wave > w2->wave)
231       return 1;
232 
233    return 0;
234 }
235 
236 /* Return wave information. "waves" should be a large enough array. */
ac_get_wave_info(enum amd_gfx_level gfx_level,const struct radeon_info * info,struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])237 unsigned ac_get_wave_info(enum amd_gfx_level gfx_level, const struct radeon_info *info,
238                           struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])
239 {
240 #ifdef _WIN32
241    return 0;
242 #else
243    char line[2000], cmd[256];
244    unsigned num_waves = 0;
245 
246    sprintf(cmd, "umr --by-pci %04x:%02x:%02x.%01x -O halt_waves -wa %s",
247            info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func,
248            gfx_level >= GFX10 ? "gfx_0.0.0" : "gfx");
249 
250    FILE *p = popen(cmd, "r");
251    if (!p)
252       return 0;
253 
254    if (!fgets(line, sizeof(line), p) || strncmp(line, "SE", 2) != 0) {
255       pclose(p);
256       return 0;
257    }
258 
259    while (fgets(line, sizeof(line), p)) {
260       struct ac_wave_info *w;
261       uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
262 
263       assert(num_waves < AC_MAX_WAVES_PER_CHIP);
264       w = &waves[num_waves];
265 
266       if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x", &w->se, &w->sh, &w->cu, &w->simd,
267                  &w->wave, &w->status, &pc_hi, &pc_lo, &w->inst_dw0, &w->inst_dw1, &exec_hi,
268                  &exec_lo) == 12) {
269          w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
270          w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
271          w->matched = false;
272          num_waves++;
273       }
274    }
275 
276    qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave);
277 
278    pclose(p);
279    return num_waves;
280 #endif
281 }
282 
/* List of GFXHUB clients from AMDGPU source code.
 * The array index is the client ID that the name belongs to.
 */
static const char *const gfx10_gfxhub_client_ids[] = {
   [0] = "CB/DB",
   [1] = "Reserved",
   [2] = "GE1",
   [3] = "GE2",
   [4] = "CPF",
   [5] = "CPC",
   [6] = "CPG",
   [7] = "RLC",
   [8] = "TCP",
   [9] = "SQC (inst)",
   [10] = "SQC (data)",
   [11] = "SQG",
   [12] = "Reserved",
   [13] = "SDMA0",
   [14] = "SDMA1",
   [15] = "GCR",
   [16] = "SDMA2",
   [17] = "SDMA3",
};
304 
305 static const char *
ac_get_gfx10_gfxhub_client(unsigned cid)306 ac_get_gfx10_gfxhub_client(unsigned cid)
307 {
308    if (cid >= ARRAY_SIZE(gfx10_gfxhub_client_ids))
309       return "UNKNOWN";
310    return gfx10_gfxhub_client_ids[cid];
311 }
312 
ac_print_gpuvm_fault_status(FILE * output,enum amd_gfx_level gfx_level,uint32_t status)313 void ac_print_gpuvm_fault_status(FILE *output, enum amd_gfx_level gfx_level,
314                                  uint32_t status)
315 {
316    if (gfx_level >= GFX10) {
317       const uint8_t cid = G_00A130_CID(status);
318 
319       fprintf(output, "GCVM_L2_PROTECTION_FAULT_STATUS: 0x%x\n", status);
320       fprintf(output, "\t CLIENT_ID: (%s) 0x%x\n", ac_get_gfx10_gfxhub_client(cid), cid);
321       fprintf(output, "\t MORE_FAULTS: %d\n", G_00A130_MORE_FAULTS(status));
322       fprintf(output, "\t WALKER_ERROR: %d\n", G_00A130_WALKER_ERROR(status));
323       fprintf(output, "\t PERMISSION_FAULTS: %d\n", G_00A130_PERMISSION_FAULTS(status));
324       fprintf(output, "\t MAPPING_ERROR: %d\n", G_00A130_MAPPING_ERROR(status));
325       fprintf(output, "\t RW: %d\n", G_00A130_RW(status));
326    } else {
327       fprintf(output, "VM_CONTEXT1_PROTECTION_FAULT_STATUS: 0x%x\n", status);
328    }
329 }
330