• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2017 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22 */
23 
24 #include "CUnit/Basic.h"
25 
26 #include "amdgpu_test.h"
27 #include "amdgpu_drm.h"
28 #include "amdgpu_internal.h"
29 #include <unistd.h>
30 #include <fcntl.h>
31 #include <stdio.h>
32 #include "xf86drm.h"
33 
/*
 * Printable name of each RAS block, in the same order as
 * enum amdgpu_ras_block.  The names are combined with "_err_count" and
 * "_err_inject" elsewhere in this file to form per-block file names.
 */
const char *ras_block_string[] = {
	"umc",		/* AMDGPU_RAS_BLOCK__UMC */
	"sdma",		/* AMDGPU_RAS_BLOCK__SDMA */
	"gfx",		/* AMDGPU_RAS_BLOCK__GFX */
	"mmhub",	/* AMDGPU_RAS_BLOCK__MMHUB */
	"athub",	/* AMDGPU_RAS_BLOCK__ATHUB */
	"pcie_bif",	/* AMDGPU_RAS_BLOCK__PCIE_BIF */
	"hdp",		/* AMDGPU_RAS_BLOCK__HDP */
	"xgmi_wafl",	/* AMDGPU_RAS_BLOCK__XGMI_WAFL */
	"df",		/* AMDGPU_RAS_BLOCK__DF */
	"smn",		/* AMDGPU_RAS_BLOCK__SMN */
	"sem",		/* AMDGPU_RAS_BLOCK__SEM */
	"mp0",		/* AMDGPU_RAS_BLOCK__MP0 */
	"mp1",		/* AMDGPU_RAS_BLOCK__MP1 */
	"fuse",		/* AMDGPU_RAS_BLOCK__FUSE */
};

/* Name of RAS block number i; the index is not range-checked. */
#define ras_block_str(i) (ras_block_string[(i)])
52 
/*
 * Identifiers of the RAS blocks.  The numeric value of each enumerator is
 * used in this file both as an index into ras_block_string[] and as a bit
 * position (1 << block) in the feature / test masks.
 */
enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__UMC = 0,
	AMDGPU_RAS_BLOCK__SDMA,
	AMDGPU_RAS_BLOCK__GFX,
	AMDGPU_RAS_BLOCK__MMHUB,
	AMDGPU_RAS_BLOCK__ATHUB,
	AMDGPU_RAS_BLOCK__PCIE_BIF,
	AMDGPU_RAS_BLOCK__HDP,
	AMDGPU_RAS_BLOCK__XGMI_WAFL,
	AMDGPU_RAS_BLOCK__DF,
	AMDGPU_RAS_BLOCK__SMN,
	AMDGPU_RAS_BLOCK__SEM,
	AMDGPU_RAS_BLOCK__MP0,
	AMDGPU_RAS_BLOCK__MP1,
	AMDGPU_RAS_BLOCK__FUSE,

	/* Sentinel: one past the last real block, i.e. the block count. */
	AMDGPU_RAS_BLOCK__LAST
};

/* Number of RAS blocks. */
#define AMDGPU_RAS_BLOCK_COUNT	AMDGPU_RAS_BLOCK__LAST
/* Mask with one bit set for every RAS block. */
#define AMDGPU_RAS_BLOCK_MASK	((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)
74 
/* RAS error kinds; every non-zero value occupies its own single bit. */
enum amdgpu_ras_error_type {
	AMDGPU_RAS_ERROR__NONE			= 0,
	AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE	= 2,
	AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE	= 4,
	AMDGPU_RAS_ERROR__POISON		= 8,
};
81 
82 struct ras_common_if {
83 	enum amdgpu_ras_block block;
84 	enum amdgpu_ras_error_type type;
85 	uint32_t sub_block_index;
86 	char name[32];
87 };
88 
89 struct ras_inject_if {
90 	struct ras_common_if head;
91 	uint64_t address;
92 	uint64_t value;
93 };
94 
95 struct ras_debug_if {
96 	union {
97 		struct ras_common_if head;
98 		struct ras_inject_if inject;
99 	};
100 	int op;
101 };
/* For now, only umc, gfx and sdma are implemented. */
#define DEFAULT_RAS_BLOCK_MASK_INJECT	(1 << AMDGPU_RAS_BLOCK__UMC)
#define DEFAULT_RAS_BLOCK_MASK_QUERY	(1 << AMDGPU_RAS_BLOCK__UMC)
#define DEFAULT_RAS_BLOCK_MASK_BASIC	((1 << AMDGPU_RAS_BLOCK__UMC) | \
					 (1 << AMDGPU_RAS_BLOCK__SDMA) | \
					 (1 << AMDGPU_RAS_BLOCK__GFX))
108 
109 static uint32_t ras_block_mask_inject = DEFAULT_RAS_BLOCK_MASK_INJECT;
110 static uint32_t ras_block_mask_query = DEFAULT_RAS_BLOCK_MASK_INJECT;
111 static uint32_t ras_block_mask_basic = DEFAULT_RAS_BLOCK_MASK_BASIC;
112 
/* Per-test bitmasks (bit n = RAS block n) selecting the blocks to exercise. */
struct ras_test_mask {
	uint32_t inject_mask;	/* blocks used by the inject test */
	uint32_t query_mask;	/* blocks used by the query test */
	uint32_t basic_mask;	/* blocks used by the basic test */
};
118 
119 struct amdgpu_ras_data {
120 	amdgpu_device_handle device_handle;
121 	uint32_t  id;
122 	uint32_t  capability;
123 	struct ras_test_mask test_mask;
124 };
125 
126 /* all devices who has ras supported */
127 static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED];
128 static int devices_count;
129 
130 struct ras_DID_test_mask{
131 	uint16_t device_id;
132 	uint16_t revision_id;
133 	struct ras_test_mask test_mask;
134 };
135 
136 /* white list for inject test. */
137 #define RAS_BLOCK_MASK_ALL {\
138 	DEFAULT_RAS_BLOCK_MASK_INJECT,\
139 	DEFAULT_RAS_BLOCK_MASK_QUERY,\
140 	DEFAULT_RAS_BLOCK_MASK_BASIC\
141 }
142 
143 #define RAS_BLOCK_MASK_QUERY_BASIC {\
144 	0,\
145 	DEFAULT_RAS_BLOCK_MASK_QUERY,\
146 	DEFAULT_RAS_BLOCK_MASK_BASIC\
147 }
148 
149 static const struct ras_DID_test_mask ras_DID_array[] = {
150 	{0x66a1, 0x00, RAS_BLOCK_MASK_ALL},
151 	{0x66a1, 0x01, RAS_BLOCK_MASK_ALL},
152 	{0x66a1, 0x04, RAS_BLOCK_MASK_ALL},
153 };
154 
/*
 * Select the test masks for a device.
 *
 * Returns the masks of the ras_DID_array entry whose device id and revision
 * id both match the PCI info of @device, or the query+basic-only default
 * when no entry matches.
 */
static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device)
{
	static struct ras_test_mask default_test_mask = RAS_BLOCK_MASK_QUERY_BASIC;
	const size_t count = sizeof(ras_DID_array) / sizeof(ras_DID_array[0]);
	size_t idx;

	for (idx = 0; idx < count; idx++) {
		const struct ras_DID_test_mask *entry = &ras_DID_array[idx];

		if (entry->device_id != device->deviceinfo.pci->device_id)
			continue;
		if (entry->revision_id != device->deviceinfo.pci->revision_id)
			continue;

		return entry->test_mask;
	}

	return default_test_mask;
}
167 
/*
 * Query AMDGPU_INFO_RAS_ENABLED_FEATURES for @device_handle and return the
 * second 32-bit word of the 64-bit result, which this file treats as the
 * mask of supported features.  Returns 0 when the query fails.
 *
 * NOTE(review): which half of the 64-bit value lands in supported_features
 * depends on host byte order; presumably little-endian is assumed -- confirm.
 */
static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle)
{
	union {
		uint64_t feature_mask;
		struct {
			uint32_t enabled_features;
			uint32_t supported_features;
		};
	} features = { 0 };

	if (amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
			sizeof(features), &features) != 0)
		return 0;

	return features.supported_features;
}
186 
187 static int get_file_contents(char *file, char *buf, int size);
188 
amdgpu_ras_lookup_id(drmDevicePtr device)189 static int amdgpu_ras_lookup_id(drmDevicePtr device)
190 {
191 	char path[1024];
192 	char str[128];
193 	drmPciBusInfo info;
194 	int i;
195 	int ret;
196 
197 	for (i = 0; i < MAX_CARDS_SUPPORTED; i++) {
198 		memset(str, 0, sizeof(str));
199 		memset(&info, 0, sizeof(info));
200 		sprintf(path, "/sys/kernel/debug/dri/%d/name", i);
201 		if (get_file_contents(path, str, sizeof(str)) <= 0)
202 			continue;
203 
204 		ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx",
205 				&info.domain, &info.bus, &info.dev, &info.func);
206 		if (ret != 4)
207 			continue;
208 
209 		if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0)
210 				return i;
211 	}
212 	return -1;
213 }
214 
suite_ras_tests_enable(void)215 CU_BOOL suite_ras_tests_enable(void)
216 {
217 	amdgpu_device_handle device_handle;
218 	uint32_t  major_version;
219 	uint32_t  minor_version;
220 	int i;
221 	drmDevicePtr device;
222 
223 	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
224 		if (amdgpu_device_initialize(drm_amdgpu[i], &major_version,
225 					&minor_version, &device_handle))
226 			continue;
227 
228 		if (drmGetDevice2(drm_amdgpu[i],
229 					DRM_DEVICE_GET_PCI_REVISION,
230 					&device))
231 			continue;
232 
233 		if (device->bustype == DRM_BUS_PCI &&
234 				amdgpu_ras_lookup_capability(device_handle)) {
235 			amdgpu_device_deinitialize(device_handle);
236 			return CU_TRUE;
237 		}
238 
239 		if (amdgpu_device_deinitialize(device_handle))
240 			continue;
241 	}
242 
243 	return CU_FALSE;
244 }
245 
suite_ras_tests_init(void)246 int suite_ras_tests_init(void)
247 {
248 	drmDevicePtr device;
249 	amdgpu_device_handle device_handle;
250 	uint32_t  major_version;
251 	uint32_t  minor_version;
252 	uint32_t  capability;
253 	struct ras_test_mask test_mask;
254 	int id;
255 	int i;
256 	int r;
257 
258 	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
259 		r = amdgpu_device_initialize(drm_amdgpu[i], &major_version,
260 				&minor_version, &device_handle);
261 		if (r)
262 			continue;
263 
264 		if (drmGetDevice2(drm_amdgpu[i],
265 					DRM_DEVICE_GET_PCI_REVISION,
266 					&device)) {
267 			amdgpu_device_deinitialize(device_handle);
268 			continue;
269 		}
270 
271 		if (device->bustype != DRM_BUS_PCI) {
272 			amdgpu_device_deinitialize(device_handle);
273 			continue;
274 		}
275 
276 		capability = amdgpu_ras_lookup_capability(device_handle);
277 		if (capability == 0) {
278 			amdgpu_device_deinitialize(device_handle);
279 			continue;
280 
281 		}
282 
283 		id = amdgpu_ras_lookup_id(device);
284 		if (id == -1) {
285 			amdgpu_device_deinitialize(device_handle);
286 			continue;
287 		}
288 
289 		test_mask = amdgpu_ras_get_test_mask(device);
290 
291 		devices[devices_count++] = (struct amdgpu_ras_data) {
292 			device_handle, id, capability, test_mask,
293 		};
294 	}
295 
296 	if (devices_count == 0)
297 		return CUE_SINIT_FAILED;
298 
299 	return CUE_SUCCESS;
300 }
301 
suite_ras_tests_clean(void)302 int suite_ras_tests_clean(void)
303 {
304 	int r;
305 	int i;
306 	int ret = CUE_SUCCESS;
307 
308 	for (i = 0; i < devices_count; i++) {
309 		r = amdgpu_device_deinitialize(devices[i].device_handle);
310 		if (r)
311 			ret = CUE_SCLEAN_FAILED;
312 	}
313 	return ret;
314 }
315 
316 static void amdgpu_ras_disable_test(void);
317 static void amdgpu_ras_enable_test(void);
318 static void amdgpu_ras_inject_test(void);
319 static void amdgpu_ras_query_test(void);
320 static void amdgpu_ras_basic_test(void);
321 
322 CU_TestInfo ras_tests[] = {
323 	{ "ras basic test",	amdgpu_ras_basic_test },
324 	{ "ras query test",	amdgpu_ras_query_test },
325 	{ "ras inject test",	amdgpu_ras_inject_test },
326 	{ "ras disable test",	amdgpu_ras_disable_test },
327 #if 0
328 	{ "ras enable test",	amdgpu_ras_enable_test },
329 #endif
330 	CU_TEST_INFO_NULL,
331 };
332 
333 //helpers
334 
335 static int test_card;
336 static char sysfs_path[1024];
337 static char debugfs_path[1024];
338 static uint32_t ras_mask;
339 static amdgpu_device_handle device_handle;
340 
set_test_card(int card)341 static int set_test_card(int card)
342 {
343 	int i;
344 
345 	test_card = card;
346 	sprintf(sysfs_path, "/sys/class/drm/card%d/device/ras/", devices[card].id);
347 	sprintf(debugfs_path, "/sys/kernel/debug/dri/%d/ras/", devices[card].id);
348 	ras_mask = devices[card].capability;
349 	device_handle = devices[card].device_handle;
350 	ras_block_mask_inject = devices[card].test_mask.inject_mask;
351 	ras_block_mask_query = devices[card].test_mask.query_mask;
352 	ras_block_mask_basic = devices[card].test_mask.basic_mask;
353 
354 	return 0;
355 }
356 
get_ras_sysfs_root(void)357 static const char *get_ras_sysfs_root(void)
358 {
359 	return sysfs_path;
360 }
361 
get_ras_debugfs_root(void)362 static const char *get_ras_debugfs_root(void)
363 {
364 	return debugfs_path;
365 }
366 
/*
 * Write @size bytes from @buf to the existing file @file with a single
 * write() call.
 *
 * Returns the value of write() (bytes written, or -1), or -1 when the file
 * cannot be opened for writing.
 */
static int set_file_contents(char *file, char *buf, int size)
{
	int written = -1;
	int fd = open(file, O_WRONLY);

	if (fd != -1) {
		written = write(fd, buf, size);
		close(fd);
	}

	return written;
}
377 
/*
 * Read up to @size bytes of @file into @buf with a single read() call.
 * The buffer is not NUL-terminated by this function.
 *
 * Returns the value of read() (bytes read, 0 at end of file, or -1), or -1
 * when the file cannot be opened for reading.
 */
static int get_file_contents(char *file, char *buf, int size)
{
	int got = -1;
	int fd = open(file, O_RDONLY);

	if (fd != -1) {
		got = read(fd, buf, size);
		close(fd);
	}

	return got;
}
388 
/*
 * Check that @file can be opened with the given open() @flags.
 *
 * Returns 0 when the open succeeds (the descriptor is closed again at
 * once), -1 otherwise.
 */
static int is_file_ok(char *file, int flags)
{
	int fd = open(file, flags);

	if (fd != -1) {
		close(fd);
		return 0;
	}

	return -1;
}
399 
amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block)400 static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block)
401 {
402 	uint32_t feature_mask;
403 	int ret;
404 
405 	ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
406 			sizeof(feature_mask), &feature_mask);
407 	if (ret)
408 		return -1;
409 
410 	return (1 << block) & feature_mask;
411 }
412 
amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block)413 static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block)
414 {
415 	return (1 << block) & ras_mask;
416 }
417 
amdgpu_ras_invoke(struct ras_debug_if * data)418 static int amdgpu_ras_invoke(struct ras_debug_if *data)
419 {
420 	char path[1024];
421 	int ret;
422 
423 	sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");
424 
425 	ret = set_file_contents(path, (char *)data, sizeof(*data))
426 		- sizeof(*data);
427 	return ret;
428 }
429 
/*
 * Read the error counters of @block from "<sysfs root><block>_err_count".
 *
 * @ue and @ce are always reset to 0 first and receive the two numbers
 * parsed from the "ue: <n>\nce: <n>" text of the file.
 *
 * Returns 0 on success, and also 0 (with both counters left at 0) when the
 * file cannot be opened.  Returns -1 when the block is not supported by
 * the selected card, the read fails or is empty, or the contents do not
 * parse.
 */
static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block,
		unsigned long *ue, unsigned long *ce)
{
	char buf[64];
	char name[1024];
	int len;

	*ue = *ce = 0;

	if (amdgpu_ras_is_feature_supported(block) <= 0)
		return -1;

	snprintf(name, sizeof(name), "%s%s%s", get_ras_sysfs_root(),
			ras_block_str(block), "_err_count");

	if (is_file_ok(name, O_RDONLY))
		return 0;

	/* Leave room for, and add, the terminator sscanf() relies on. */
	len = get_file_contents(name, buf, sizeof(buf) - 1);
	if (len <= 0)
		return -1;
	buf[len] = '\0';

	if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2)
		return -1;

	return 0;
}
455 
456 //tests
amdgpu_ras_features_test(int enable)457 static void amdgpu_ras_features_test(int enable)
458 {
459 	struct ras_debug_if data;
460 	int ret;
461 	int i;
462 
463 	data.op = enable;
464 	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
465 		struct ras_common_if head = {
466 			.block = i,
467 			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
468 			.sub_block_index = 0,
469 			.name = "",
470 		};
471 
472 		if (amdgpu_ras_is_feature_supported(i) <= 0)
473 			continue;
474 
475 		data.head = head;
476 
477 		ret = amdgpu_ras_invoke(&data);
478 		CU_ASSERT_EQUAL(ret, 0);
479 
480 		if (ret)
481 			continue;
482 
483 		ret = enable ^ amdgpu_ras_is_feature_enabled(i);
484 		CU_ASSERT_EQUAL(ret, 0);
485 	}
486 }
487 
amdgpu_ras_disable_test(void)488 static void amdgpu_ras_disable_test(void)
489 {
490 	int i;
491 	for (i = 0; i < devices_count; i++) {
492 		set_test_card(i);
493 		amdgpu_ras_features_test(0);
494 	}
495 }
496 
amdgpu_ras_enable_test(void)497 static void amdgpu_ras_enable_test(void)
498 {
499 	int i;
500 	for (i = 0; i < devices_count; i++) {
501 		set_test_card(i);
502 		amdgpu_ras_features_test(1);
503 	}
504 }
505 
__amdgpu_ras_inject_test(void)506 static void __amdgpu_ras_inject_test(void)
507 {
508 	struct ras_debug_if data;
509 	int ret;
510 	int i;
511 	unsigned long ue, ce, ue_old, ce_old;
512 
513 	data.op = 2;
514 	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
515 		int timeout = 3;
516 		struct ras_inject_if inject = {
517 			.head = {
518 				.block = i,
519 				.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
520 				.sub_block_index = 0,
521 				.name = "",
522 			},
523 			.address = 0,
524 			.value = 0,
525 		};
526 
527 		if (amdgpu_ras_is_feature_enabled(i) <= 0)
528 			continue;
529 
530 		if (!((1 << i) & ras_block_mask_inject))
531 			continue;
532 
533 		data.inject = inject;
534 
535 		ret = amdgpu_ras_query_err_count(i, &ue_old, &ce_old);
536 		CU_ASSERT_EQUAL(ret, 0);
537 
538 		if (ret)
539 			continue;
540 
541 		ret = amdgpu_ras_invoke(&data);
542 		CU_ASSERT_EQUAL(ret, 0);
543 
544 		if (ret)
545 			continue;
546 
547 loop:
548 		while (timeout > 0) {
549 			ret = amdgpu_ras_query_err_count(i, &ue, &ce);
550 			CU_ASSERT_EQUAL(ret, 0);
551 
552 			if (ret)
553 				continue;
554 			if (ue_old != ue) {
555 				/*recovery takes ~10s*/
556 				sleep(10);
557 				break;
558 			}
559 
560 			sleep(1);
561 			timeout -= 1;
562 		}
563 
564 		CU_ASSERT_EQUAL(ue_old + 1, ue);
565 		CU_ASSERT_EQUAL(ce_old, ce);
566 	}
567 }
568 
amdgpu_ras_inject_test(void)569 static void amdgpu_ras_inject_test(void)
570 {
571 	int i;
572 	for (i = 0; i < devices_count; i++) {
573 		set_test_card(i);
574 		__amdgpu_ras_inject_test();
575 	}
576 }
577 
__amdgpu_ras_query_test(void)578 static void __amdgpu_ras_query_test(void)
579 {
580 	unsigned long ue, ce;
581 	int ret;
582 	int i;
583 
584 	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
585 		if (amdgpu_ras_is_feature_supported(i) <= 0)
586 			continue;
587 
588 		if (!((1 << i) & ras_block_mask_query))
589 			continue;
590 
591 		ret = amdgpu_ras_query_err_count(i, &ue, &ce);
592 		CU_ASSERT_EQUAL(ret, 0);
593 	}
594 }
595 
amdgpu_ras_query_test(void)596 static void amdgpu_ras_query_test(void)
597 {
598 	int i;
599 	for (i = 0; i < devices_count; i++) {
600 		set_test_card(i);
601 		__amdgpu_ras_query_test();
602 	}
603 }
604 
amdgpu_ras_basic_test(void)605 static void amdgpu_ras_basic_test(void)
606 {
607 	unsigned long ue, ce;
608 	char name[1024];
609 	int ret;
610 	int i;
611 	int j;
612 	uint32_t features;
613 	char path[1024];
614 
615 	ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY);
616 	CU_ASSERT_EQUAL(ret, 0);
617 
618 	for (i = 0; i < devices_count; i++) {
619 		set_test_card(i);
620 
621 		ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
622 				sizeof(features), &features);
623 		CU_ASSERT_EQUAL(ret, 0);
624 
625 		sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl");
626 		ret = is_file_ok(path, O_WRONLY);
627 		CU_ASSERT_EQUAL(ret, 0);
628 
629 		sprintf(path, "%s%s", get_ras_sysfs_root(), "features");
630 		ret = is_file_ok(path, O_RDONLY);
631 		CU_ASSERT_EQUAL(ret, 0);
632 
633 		for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) {
634 			ret = amdgpu_ras_is_feature_supported(j);
635 			if (ret <= 0)
636 				continue;
637 
638 			if (!((1 << j) & ras_block_mask_basic))
639 				continue;
640 
641 			sprintf(path, "%s%s%s", get_ras_sysfs_root(), ras_block_str(j), "_err_count");
642 			ret = is_file_ok(path, O_RDONLY);
643 			CU_ASSERT_EQUAL(ret, 0);
644 
645 			sprintf(path, "%s%s%s", get_ras_debugfs_root(), ras_block_str(j), "_err_inject");
646 			ret = is_file_ok(path, O_WRONLY);
647 			CU_ASSERT_EQUAL(ret, 0);
648 		}
649 	}
650 }
651