/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but it does not match any command */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s %u %llu %llu",
						&sub_block, &address, &value) != 3)
				if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
							&sub_block, &address, &value) != 3)
					return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * It accepts struct ras_debug_if, which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members: block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, e.g. GFX, SDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head: address and value.
 * As their names indicate, the inject operation will write the
 * value to the address.
 *
 * Second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *  0: disable RAS on the block. Takes ::head as its data.
 *  1: enable RAS on the block. Takes ::head as its data.
 *  2: inject errors on the block. Takes ::inject as its data.
 *
 * How to use the interface?
 * programs:
 * copy struct ras_debug_if into your code, initialize it, and
 * write the struct to the control node.
 *
 * bash:
 * echo op block [error [sub_block address value]] > .../ras/ras_ctrl
 *	op: disable, enable, inject
 *		disable: only block is needed
 *		enable: block and error are needed
 *		inject: error, address, value are needed
 *	block: umc, sdma, gfx, .........
 *		see ras_block_string[] for details
 *	error: ue, ce
 *		ue: multi_uncorrectable
 *		ce: single_correctable
 *	sub_block: sub block index, pass 0 if there is no sub block
 *
 * here are some examples of bash commands:
 *	echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check the ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding error count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * NOTE: operation is only allowed on blocks which are supported.
 * Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 */
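
/*
 * Below is a minimal userspace sketch of the "programs" path described
 * above: it issues one command per write(), since the node rejects writes
 * that do not start at offset 0. Illustrative only; the debugfs mount
 * point and card index are assumptions.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		const char *cmd = "inject umc ue 0x0 0x0 0x0\n";
 *		int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		if (write(fd, cmd, strlen(cmd)) < 0)	// one command per write
 *			return 1;
 *		return close(fd);
 *	}
 */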
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		if ((data.inject.address >= adev->gmc.mc_vram_size) ||
		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
			ret = -EINVAL;
			break;
		}

		/* data.inject.address is an offset rather than an absolute gpu address */
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}
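
/*
 * For example, reading one of the per-block sysfs nodes backed by this
 * attribute (names come from ras_fs_if::sysfs_name, e.g. umc_err_count)
 * might return:
 *
 *	ue: 0
 *	ce: 2
 */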

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* the obj already exists, do not create it again */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * if the obj is not created yet, then create one.
 * set the feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If the hardware does not support ras, then do not create the obj.
	 * But if the hardware does support ras, we can create it.
	 * The ras framework checks con->hw_supported to see if it needs to do
	 * the corresponding initialization.
	 * IPs check con->support to see if they need to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* in case we created the obj somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable" : "disable",
				ras_block_str(head->block),
				ret);
		if (ret == TA_RAS_STATUS__RESET_NEEDED)
			return -EAGAIN;
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless
			 * of the current ras state.
			 * If the current state == target state, it does
			 * nothing. But sometimes it requests the driver to
			 * reset and repost with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With an old ras TA, we might fail to enable ras.
			 * Log it and just set up the object.
			 * TODO: this WA needs to be removed in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					DRM_INFO("RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* set up the object, then issue a ras TA disable cmd. */
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			ret = amdgpu_ras_feature_enable(adev, head, 0);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp,
		 * i.e. just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/*
			 * bypass psp. vbios has enabled ras for us,
			 * so just create the obj.
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data = {0, 0, 0, NULL};

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__UMC:
		if (adev->umc.funcs->query_ras_error_count)
			adev->umc.funcs->query_ras_error_count(adev, &err_data);
		/* umc query_ras_error_address is also responsible for clearing
		 * the error status
		 */
		if (adev->umc.funcs->query_ras_error_address)
			adev->umc.funcs->query_ras_error_address(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->query_ras_error_count)
			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub_funcs->query_ras_error_count)
			adev->mmhub_funcs->query_ras_error_count(adev, &err_data);
		break;
	default:
		break;
	}

	obj->err_data.ue_count += err_data.ue_count;
	obj->err_data.ce_count += err_data.ce_count;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	if (err_data.ce_count)
		dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
			 obj->err_data.ce_count, ras_block_str(info->head.block));
	if (err_data.ue_count)
		dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
			 obj->err_data.ue_count, ras_block_str(info->head.block));

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.funcs->ras_error_inject)
			ret = adev->gfx.funcs->ras_error_inject(adev, info);
		else
			ret = -EINVAL;
		break;
	case AMDGPU_RAS_BLOCK__UMC:
	case AMDGPU_RAS_BLOCK__MMHUB:
		ret = psp_ras_trigger_error(&adev->psp, &block_info);
		break;
	default:
		DRM_INFO("%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
		ret = -EINVAL;
	}

	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return 0;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return 0;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */


/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case 0:
		return "R";
	case 1:
		return "P";
	case 2:
	default:
		return "F";
	}
}

/*
 * DOC: ras sysfs gpu_vram_bad_pages interface
 *
 * It allows the user to read the bad pages of vram on the gpu through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * It outputs multiple lines, each of which stands for one gpu page.
 *
 * The format of one line is below:
 * gpu pfn : gpu page size : flags
 *
 * gpu pfn and gpu page size are printed in hex format.
 * flags is one of the characters below:
 * R: reserved, this gpu page is reserved and not usable.
 * P: pending for reserve, this gpu page is marked as bad and will be
 *    reserved in the next window of page_reserve.
 * F: unable to reserve, this gpu page cannot be reserved for some reason.
 *
 * examples:
 * 0x00000001 : 0x00001000 : R
 * 0x00000002 : 0x00001000 : P
 */
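
/*
 * A minimal userspace sketch of consuming the interface above: it counts
 * pages still pending reservation. Illustrative only; the card index in
 * the path is an assumption.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *fp = fopen("/sys/class/drm/card0/device/ras/gpu_vram_bad_pages", "r");
 *		unsigned long pfn, size;
 *		char flag;
 *		int pending = 0;
 *
 *		if (!fp)
 *			return 1;
 *		while (fscanf(fp, "0x%lx : 0x%lx : %c\n", &pfn, &size, &flag) == 3)
 *			if (flag == 'P')
 *				pending++;
 *		printf("%d pages pending reservation\n", pending);
 *		fclose(fp);
 *		return 0;
 *	}
 */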

static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);

	return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};

	con->badpages_attr = (struct bin_attribute) {
		.attr = {
			.name = "gpu_vram_bad_pages",
			.mode = S_IRUGO,
		},
		.size = 0,
		.private = NULL,
		.read = amdgpu_ras_sysfs_badpages_read,
	};

	sysfs_attr_init(attrs[0]);
	sysfs_bin_attr_init(bin_attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
/* sysfs end */

/* debugfs begin */
static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;

	con->dir = debugfs_create_dir("ras", minor->debugfs_root);
	con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
				       adev, &amdgpu_ras_debugfs_ctrl_ops);
}

void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->ent)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
				       S_IWUGO | S_IRUGO, con->dir, obj,
				       &amdgpu_ras_debugfs_ops);
}

void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);
}

static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->ent);
	debugfs_remove(con->dir);
	con->dir = NULL;
	con->ent = NULL;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;
	struct ras_err_data err_data = {0, 0, 0, NULL};

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data, maybe we need to get the output
		 * from the callback to update the error type/count, etc.
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &err_data, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need to do a reset to recover the whole system.
			 * But leave the IP to do that recovery; here we just
			 * dispatch the error.
			 */
			if (ret == AMDGPU_RAS_SUCCESS) {
				/* these counts could be left as 0 if
				 * some blocks do not count error numbers
				 */
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
			}
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* Might overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling the ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback etc. */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}
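
/*
 * An illustrative sketch of how an IP block might hook into the ring
 * above. The callback name is hypothetical; its signature follows
 * ras_ih_data::cb as used in amdgpu_ras_interrupt_handler().
 *
 *	static int my_block_process_err(struct amdgpu_device *adev,
 *					struct ras_err_data *err_data,
 *					struct amdgpu_iv_entry *entry)
 *	{
 *		// decode entry and bump err_data->ue_count/ce_count
 *		return AMDGPU_RAS_SUCCESS;
 *	}
 *
 *	struct ras_ih_if ih_info = {
 *		.head = *ras_if,	// the block's ras_common_if
 *		.cb = my_block_process_err,
 *	};
 *	r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
 */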

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */

/* recovery begin */

/* return 0 on success.
 * caller needs to free bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].bp,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = 0,
		};

		if (data->last_reserved <= i)
			(*bps)[i].flags = 1;
		else if (data->bps[i].bo == NULL)
			(*bps)[i].flags = 2;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 1024);
	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;

	if (data->bps) {
		memcpy(tmp, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = tmp;
	data->space_left += align_space - old_space;
	return 0;
}
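
/*
 * A worked example of the rounding above: with data->count == 0,
 * data->space_left == 0 and pages == 10, new_space is 10, align_space
 * rounds up to 1024, so room for 1024 entries is allocated and
 * space_left grows by 1024.
 */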

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = pages;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo = NULL;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_bo_create_kernel_at(adev, bp << PAGE_SHIFT, PAGE_SIZE,
					       AMDGPU_GEM_DOMAIN_VRAM,
					       &bo, NULL))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
		bo = NULL;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

/* called on driver unload */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		amdgpu_bo_free_kernel(&bo, NULL, NULL);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when the SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when the SMU is disabled.
	 */
	return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
	if (!*data)
		return -ENOMEM;

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	amdgpu_ras_load_bad_pages(adev);
	amdgpu_ras_reserve_bad_pages(adev);

	return 0;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

/* return 0 if ras will reset gpu and repost. */
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
		unsigned int block)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!ras)
		return -EINVAL;

	ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
	return 0;
}

/*
 * check the hardware's ras ability, which is saved in hw_supported.
 * if the hardware does not support ras, we can skip some ras initialization
 * and forbid ras operations from IPs.
 * if software itself, say a boot parameter, limits the ras ability, we still
 * need to allow IPs to do some limited operations, like disable. In such a
 * case we have to initialize ras as normal, but need to check whether the
 * operation is allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) ||
			adev->asic_type != CHIP_VEGA20)
		return;

	if (adev->is_atom_fw &&
			(amdgpu_atomfirmware_mem_ecc_supported(adev) ||
			 amdgpu_atomfirmware_sram_ecc_supported(adev)))
		*hw_supported = AMDGPU_RAS_BLOCK_MASK;

	*supported = amdgpu_ras_enable == 0 ?
				0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	if (!con->hw_supported) {
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
		return 0;
	}

	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this flag from vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

	if (amdgpu_ras_recovery_init(adev))
		goto recovery_out;

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	/* ras init for each ras block */
	if (adev->umc.funcs->ras_init)
		adev->umc.funcs->ras_init(adev);

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_recovery_fini(adev);
recovery_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* do some init work after IP late init, as it depends on it.
 * it runs in the resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that an IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as the driver does not handle it,
		 * ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but a boot
		 * parameter might disable some of them and one or more IPs
		 * might not be implemented yet. So we disable them on behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should not be any reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* set up the ras obj state as disabled.
		 * this is for the init_by_vbios case.
		 * if we want to enable ras, just enable it in a normal way.
		 * If we want to disable it, we need to set up the ras obj as
		 * enabled, then issue another TA disable cmd.
		 * See feature_enable_on_boot.
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev, 0);
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* do some fini work before IP fini, as it depends on the IPs */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}