/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS	1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET	2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s %u %llu %llu",
						&sub_block, &address, &value) != 3)
				if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
							&sub_block, &address, &value) != 3)
					return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * It accepts a struct ras_debug_if, which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members: block, type, sub_block_index and name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have sub-components, e.g. GFX, SDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head: address and value.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * Second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 * 0: disable RAS on the block. Takes ::head as its data.
 * 1: enable RAS on the block. Takes ::head as its data.
 * 2: inject errors on the block. Takes ::inject as its data.
 *
 * How to use the interface?
 * programs:
 * copy struct ras_debug_if into your code and initialize it.
 * write the struct to the control node.
 *
 * bash:
 * echo op block [error [sub_block address value]] > .../ras/ras_ctrl
 *	op: disable, enable, inject
 *		disable: only block is needed
 *		enable: block and error are needed
 *		inject: block, error, sub_block, address and value are needed
 *	block: umc, sdma, gfx, ...
 *		see ras_block_string[] for details
 *	error: ue, ce
 *		ue: multi_uncorrectable
 *		ce: single_correctable
 *	sub_block: sub block index, pass 0 if there is no sub block
 *
 * here are some examples for bash commands:
 * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 * echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
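 * echo enable umc ce > /sys/kernel/debug/dri/0/ras/ras_ctrl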
 *
 * How to check the result?
 *
 * For disable/enable, please check the ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding error count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * NOTE: operation is only allowed on blocks which are supported.
 * Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 */
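
/* A minimal sketch of the "programs" path described above, as seen from
 * userspace. It assumes the caller has privileges to open the debugfs node
 * and a local copy of struct ras_debug_if matching the kernel layout; the
 * values are illustrative only:
 *
 *	int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *	struct ras_debug_if data = {0};
 *
 *	data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *	data.head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
 *	data.op = 1;	// enable
 *
 *	if (write(fd, &data, sizeof(data)) != sizeof(data))
 *		// request rejected, see the error handling above
 *	close(fd);
 *
 * Note the write must cover the whole struct, otherwise the parser above
 * returns -EINVAL.
 */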
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		if ((data.inject.address >= adev->gmc.mc_vram_size) ||
		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
			ret = -EINVAL;
			break;
		}

		/* data.inject.address is an offset instead of an absolute gpu address */
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

/* obj begin */
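
/* Each ras block is tracked by one ras_manager object in con->objs.
 * Objects are reference counted via ->use: get_obj()/alive_obj() take and
 * test the count, and put_obj() drops the object from the con->head list
 * once the count reaches zero.
 */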

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists. return obj? */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if obj is not created, then create one.
 * set feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If the hardware does not support ras, then do not create the obj.
	 * But if the hardware does support ras, we can create it.
	 * The ras framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * An IP checks con->supported to see if it needs to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* In case we created the obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		if (ret == TA_RAS_STATUS__RESET_NEEDED)
			return -EAGAIN;
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless
			 * of the current ras state.
			 * If the current state equals the target state, it
			 * does nothing. But sometimes it requests the driver
			 * to reset and repost with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With an old ras TA, we might fail to enable ras.
			 * Log it and just set up the object.
			 * TODO: remove this workaround in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					DRM_INFO("RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd. */
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			ret = amdgpu_ras_feature_enable(adev, head, 0);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios enables ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
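/* Query the ue/ce counters for the block named in info->head: the per-IP
 * query callbacks fill a local ras_err_data, the deltas are accumulated
 * into the object's running totals, and those totals are reported back
 * through info->ue_count and info->ce_count.
 */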
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data = {0, 0, 0, NULL};

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__UMC:
		if (adev->umc.funcs->query_ras_error_count)
			adev->umc.funcs->query_ras_error_count(adev, &err_data);
		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		if (adev->umc.funcs->query_ras_error_address)
			adev->umc.funcs->query_ras_error_address(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_count)
			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub_funcs->query_ras_error_count)
			adev->mmhub_funcs->query_ras_error_count(adev, &err_data);
		break;
	default:
		break;
	}

	obj->err_data.ue_count += err_data.ue_count;
	obj->err_data.ce_count += err_data.ce_count;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	if (err_data.ce_count)
		dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
			 obj->err_data.ce_count, ras_block_str(info->head.block));
	if (err_data.ue_count)
		dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
			 obj->err_data.ue_count, ras_block_str(info->head.block));

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->ras_error_inject)
			ret = adev->gfx.funcs->ras_error_inject(adev, info);
		else
			ret = -EINVAL;
		break;
	case AMDGPU_RAS_BLOCK__UMC:
	case AMDGPU_RAS_BLOCK__MMHUB:
		ret = psp_ras_trigger_error(&adev->psp, &block_info);
		break;
	default:
		DRM_INFO("%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
		ret = -EINVAL;
	}

	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return 0;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return 0;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */

/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case 0:
		return "R";
	case 1:
		return "P";
	case 2:
	default:
		return "F";
	}
}

/*
 * DOC: ras sysfs gpu_vram_bad_pages interface
 *
 * It allows the user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * It outputs multiple lines, and each line stands for one gpu page.
 *
 * The format of one line is below,
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags can be one of the below characters,
 * R: reserved, this gpu page is reserved and cannot be used.
 * P: pending for reserve, this gpu page is marked as bad and will be
 *    reserved in the next window of page_reserve.
 * F: unable to reserve. this gpu page cannot be reserved for some reason.
 *
 * examples:
 * 0x00000001 : 0x00001000 : R
 * 0x00000002 : 0x00001000 : P
 */

static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);

	return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};

	con->badpages_attr = (struct bin_attribute) {
		.attr = {
			.name = "gpu_vram_bad_pages",
			.mode = S_IRUGO,
		},
		.size = 0,
		.private = NULL,
		.read = amdgpu_ras_sysfs_badpages_read,
	};

	sysfs_attr_init(attrs[0]);
	sysfs_bin_attr_init(bin_attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

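/* Create a per-block error-count file in the "ras" sysfs group, named by
 * head->sysfs_name and backed by amdgpu_ras_sysfs_read(). Takes a reference
 * on the obj for the lifetime of the attribute.
 */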
int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/* debugfs begin */
static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;

	con->dir = debugfs_create_dir("ras", minor->debugfs_root);
	con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
				adev, &amdgpu_ras_debugfs_ctrl_ops);
}

void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->ent)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
				       S_IWUGO | S_IRUGO, con->dir, obj,
				       &amdgpu_ras_debugfs_ops);
}

void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);
}

static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->ent);
	debugfs_remove(con->dir);
	con->dir = NULL;
	con->ent = NULL;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */

/* ih begin */
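
/* The ih path is split in two halves: amdgpu_ras_interrupt_dispatch() copies
 * the iv entry into a per-object ring buffer from the interrupt path and
 * schedules a work item, while amdgpu_ras_interrupt_handler() drains the
 * ring from that work item and hands each entry to the IP's callback.
 */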
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;
	struct ras_err_data err_data = {0, 0, 0, NULL};

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data; maybe we need to get the output
		 * from the callback to update the error type/count, etc.
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &err_data, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave the IP to do that recovery; here we just
			 * dispatch the error.
			 */
			if (ret == AMDGPU_RAS_SUCCESS) {
				/* these counts could be left as 0 if
				 * some blocks do not count error numbers
				 */
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
			}
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback, etc. */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */

/* recovery begin */
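
/* Bad page handling: amdgpu_ras_add_bad_pages() records retired page frame
 * numbers under recovery_lock, amdgpu_ras_reserve_bad_pages() then pins the
 * matching vram pages so the allocator can no longer hand them out, and
 * amdgpu_ras_release_bad_pages() drops those reservations on unload.
 */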

/* return 0 on success.
 * the caller needs to free bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].bp,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = 0,
		};

		if (data->last_reserved <= i)
			(*bps)[i].flags = 1;
		else if (data->bps[i].bo == NULL)
			(*bps)[i].flags = 2;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 1024);
	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;

	if (data->bps) {
		memcpy(tmp, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = tmp;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = pages;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo = NULL;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_bo_create_kernel_at(adev, bp << PAGE_SHIFT, PAGE_SIZE,
					       AMDGPU_GEM_DOMAIN_VRAM,
					       &bo, NULL))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
		bo = NULL;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

/* called when the driver unloads */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		amdgpu_bo_free_kernel(&bo, NULL, NULL);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when the SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when the SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
	if (!*data)
		return -ENOMEM;

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	amdgpu_ras_load_bad_pages(adev);
	amdgpu_ras_reserve_bad_pages(adev);

	return 0;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

/* return 0 if ras will reset the gpu and repost. */
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
		unsigned int block)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!ras)
		return -EINVAL;

	ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
	return 0;
}

/*
 * check the hardware's ras ability, which will be saved in hw_supported.
 * if the hardware does not support ras, we can skip some ras initialization
 * and forbid some ras operations from IPs.
 * if software itself, say a boot parameter, limits the ras ability, we still
 * need to allow IPs to do some limited operations, like disable. In such a
 * case we have to initialize ras as normal, but need to check whether the
 * operation is allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) ||
			adev->asic_type != CHIP_VEGA20)
		return;

	if (adev->is_atom_fw &&
			(amdgpu_atomfirmware_mem_ecc_supported(adev) ||
			 amdgpu_atomfirmware_sram_ecc_supported(adev)))
		*hw_supported = AMDGPU_RAS_BLOCK_MASK;

	*supported = amdgpu_ras_enable == 0 ?
				0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	if (!con->hw_supported) {
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
		return 0;
	}

	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this flag from vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

	if (amdgpu_ras_recovery_init(adev))
		goto recovery_out;

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	/* ras init for each ras block */
	if (adev->umc.funcs->ras_init)
		adev->umc.funcs->ras_init(adev);

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_recovery_fini(adev);
recovery_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* Do some init work after IP late init, as a dependency.
 * It runs in the resume/gpu-reset/boot-up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing: the IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but since the driver does not handle
		 * it, ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but the boot
		 * parameter might disable some of them, and one or more IPs
		 * may not be implemented yet, so we disable those on their
		 * behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should not be any reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* set up the ras obj state as disabled.
		 * for the init_by_vbios case.
		 * if we want to enable ras, just enable it in a normal way.
		 * if we want to disable it, we need to set up the ras obj as
		 * enabled, then issue another TA disable cmd.
		 * see feature_enable_on_boot
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev, 0);
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* do some fini work before IP fini, as a dependency */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}