1 /* 2 * Copyright 2017 Advanced Micro Devices, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 
 *
 */

#include "CUnit/Basic.h"

#include "amdgpu_test.h"
#include "amdgpu_drm.h"
#include "amdgpu_internal.h"
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include "xf86drm.h"
#include <limits.h>

#define PATH_SIZE PATH_MAX

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Sysfs/debugfs node name for each RAS-capable IP block.
 * Indexed by enum amdgpu_ras_block below -- the two must stay in sync. */
const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_block_str(i) (ras_block_string[i])

/* IP blocks that can report RAS errors; mirrors the kernel's internal
 * enum of the same name, so the numeric values are ABI and must not be
 * reordered. */
enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__UMC = 0,
	AMDGPU_RAS_BLOCK__SDMA,
	AMDGPU_RAS_BLOCK__GFX,
	AMDGPU_RAS_BLOCK__MMHUB,
	AMDGPU_RAS_BLOCK__ATHUB,
	AMDGPU_RAS_BLOCK__PCIE_BIF,
	AMDGPU_RAS_BLOCK__HDP,
	AMDGPU_RAS_BLOCK__XGMI_WAFL,
	AMDGPU_RAS_BLOCK__DF,
	AMDGPU_RAS_BLOCK__SMN,
	AMDGPU_RAS_BLOCK__SEM,
	AMDGPU_RAS_BLOCK__MP0,
	AMDGPU_RAS_BLOCK__MP1,
	AMDGPU_RAS_BLOCK__FUSE,

	AMDGPU_RAS_BLOCK__LAST
};

#define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST
#define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)

/* GFX sub-block indices used as ras_common_if.sub_block_index when
 * injecting into the GFX block.  Values mirror the kernel's enum; the
 * *_INDEX_START/_END aliases delimit each hardware unit's range. */
enum amdgpu_ras_gfx_subblock {
	/* CPC */
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START = 0,
	AMDGPU_RAS_BLOCK__GFX_CPC_SCRATCH =
		AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPC_UCODE,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME1,
	AMDGPU_RAS_BLOCK__GFX_DC_STATE_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_CSINVOC_ME2,
	AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,
	AMDGPU_RAS_BLOCK__GFX_CPC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_DC_RESTORE_ME2,
	/* CPF */
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME2 =
		AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPF_ROQ_ME1,
	AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPF_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
	/* CPG */
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_ROQ =
		AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_CPG_DMA_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
	AMDGPU_RAS_BLOCK__GFX_CPG_INDEX_END = AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
	/* GDS */
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_MEM = AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_GDS_INPUT_QUEUE,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_CMD_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PHY_DATA_RAM_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,
	AMDGPU_RAS_BLOCK__GFX_GDS_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_GDS_OA_PIPE_MEM,
	/* SPI */
	AMDGPU_RAS_BLOCK__GFX_SPI_SR_MEM,
	/* SQ */
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_SGPR = AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D,
	AMDGPU_RAS_BLOCK__GFX_SQ_LDS_I,
	AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,
	AMDGPU_RAS_BLOCK__GFX_SQ_INDEX_END = AMDGPU_RAS_BLOCK__GFX_SQ_VGPR,
	/* SQC (3 ranges) */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	/* SQC range 0 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_UTCL1_LFIFO =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU0_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_WRITE_DATA_BUF,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
	/* SQC range 1 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKA_BANK_RAM,
	/* SQC range 2 */
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_TAG_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_HIT_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_MISS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_DATA_BANKB_BANK_RAM,
	AMDGPU_RAS_BLOCK__GFX_SQC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_SQC_INDEX2_END,
	/* TA */
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO =
		AMDGPU_RAS_BLOCK__GFX_TA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_AFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FL_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FX_LFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,
	AMDGPU_RAS_BLOCK__GFX_TA_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TA_FS_CFIFO,
	/* TCA */
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_HOLE_FIFO =
		AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCA_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCA_REQ_FIFO,
	/* TCC (5 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	/* TCC range 0 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_0,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DIRTY_BANK_1,
	AMDGPU_RAS_BLOCK__GFX_TCC_HIGH_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_LOW_RATE_TAG,
	/* TCC range 1 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_DEC =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_IN_USE_TRANSFER,
	/* TCC range 2 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_DATA =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_RETURN_CONTROL,
	AMDGPU_RAS_BLOCK__GFX_TCC_UC_ATOMIC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_RETURN,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRITE_CACHE_READ,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_SRC_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_TAG_PROBE_FIFO,
	/* TCC range 3 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX3_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
	/* TCC range 4 */
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_WRRET_TAG_WRITE_RETURN =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_START,
	AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_ATOMIC_RETURN_BUFFER,
	AMDGPU_RAS_BLOCK__GFX_TCC_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCC_INDEX4_END,
	/* TCI */
	AMDGPU_RAS_BLOCK__GFX_TCI_WRITE_RAM,
	/* TCP */
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM =
		AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TCP_LFIFO_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_CMD_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_VM_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TCP_DB_RAM,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO0,
	AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,
	AMDGPU_RAS_BLOCK__GFX_TCP_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_TCP_UTCL1_LFIFO1,
	/* TD */
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO =
		AMDGPU_RAS_BLOCK__GFX_TD_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_HI,
	AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,
	AMDGPU_RAS_BLOCK__GFX_TD_INDEX_END = AMDGPU_RAS_BLOCK__GFX_TD_CS_FIFO,
	/* EA (3 sub-ranges) */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	/* EA range 0 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_RRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_WRET_TAGMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX0_END =
		AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_DATAMEM,
	/* EA range 1 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_PAGEMEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_START,
	AMDGPU_RAS_BLOCK__GFX_EA_DRAMWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IORD_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_CMDMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_IOWR_DATAMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIRD_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX1_END =
		AMDGPU_RAS_BLOCK__GFX_EA_GMIWR_PAGEMEM,
	/* EA range 2 */
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D0MEM =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_START,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D1MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D2MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END =
		AMDGPU_RAS_BLOCK__GFX_EA_MAM_D3MEM,
	AMDGPU_RAS_BLOCK__GFX_EA_INDEX_END =
		AMDGPU_RAS_BLOCK__GFX_EA_INDEX2_END,
	/* UTC VM L2 bank */
	AMDGPU_RAS_BLOCK__UTC_VML2_BANK_CACHE,
	/* UTC VM walker */
	AMDGPU_RAS_BLOCK__UTC_VML2_WALKER,
	/* UTC ATC L2 2MB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_2M_BANK,
	/* UTC ATC L2 4KB cache */
	AMDGPU_RAS_BLOCK__UTC_ATCL2_CACHE_4K_BANK,
	AMDGPU_RAS_BLOCK__GFX_MAX
};

/* Error severities understood by the kernel RAS control interface.
 * Bit-mask style values (can in principle be OR'd). */
enum amdgpu_ras_error_type {
	AMDGPU_RAS_ERROR__NONE = 0,
	AMDGPU_RAS_ERROR__PARITY = 1,
	AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE = 2,
	AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE = 4,
	AMDGPU_RAS_ERROR__POISON = 8,
};

/* One row of an injection test table (see umc/gfx_ras_inject_test). */
struct ras_inject_test_config {
	char name[64];    /* human-readable test case id, e.g. "ras_gfx.2.0" */
	char block[32];   /* block name, looked up in ras_block_string */
	int sub_block;    /* enum amdgpu_ras_gfx_subblock value (0 for umc) */
	enum amdgpu_ras_error_type type;
	uint64_t address;
	uint64_t value;
};

/* Header shared by all debugfs ras_ctrl requests. */
struct ras_common_if {
	enum amdgpu_ras_block block;
	enum amdgpu_ras_error_type type;
	uint32_t sub_block_index;
	char name[32];
};

/* Injection request: header plus target address/value. */
struct ras_inject_if {
	struct ras_common_if head;
	uint64_t address;
	uint64_t value;
};

/* Raw payload written to the debugfs ras_ctrl node.
 * op: 0 = disable feature, 1 = enable feature, 2 = inject error
 * (must match the kernel's amdgpu_ras_debugfs_ctrl_parse ops). */
struct ras_debug_if {
	union {
		struct ras_common_if head;
		struct ras_inject_if inject;
	};
	int op;
};
/* for now, only umc, gfx, sdma has implemented.
*/ 348 #define DEFAULT_RAS_BLOCK_MASK_INJECT ((1 << AMDGPU_RAS_BLOCK__UMC) |\ 349 (1 << AMDGPU_RAS_BLOCK__GFX)) 350 #define DEFAULT_RAS_BLOCK_MASK_QUERY ((1 << AMDGPU_RAS_BLOCK__UMC) |\ 351 (1 << AMDGPU_RAS_BLOCK__GFX)) 352 #define DEFAULT_RAS_BLOCK_MASK_BASIC (1 << AMDGPU_RAS_BLOCK__UMC |\ 353 (1 << AMDGPU_RAS_BLOCK__SDMA) |\ 354 (1 << AMDGPU_RAS_BLOCK__GFX)) 355 356 static uint32_t ras_block_mask_inject = DEFAULT_RAS_BLOCK_MASK_INJECT; 357 static uint32_t ras_block_mask_query = DEFAULT_RAS_BLOCK_MASK_INJECT; 358 static uint32_t ras_block_mask_basic = DEFAULT_RAS_BLOCK_MASK_BASIC; 359 360 struct ras_test_mask { 361 uint32_t inject_mask; 362 uint32_t query_mask; 363 uint32_t basic_mask; 364 }; 365 366 struct amdgpu_ras_data { 367 amdgpu_device_handle device_handle; 368 uint32_t id; 369 uint32_t capability; 370 struct ras_test_mask test_mask; 371 }; 372 373 /* all devices who has ras supported */ 374 static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED]; 375 static int devices_count; 376 377 struct ras_DID_test_mask{ 378 uint16_t device_id; 379 uint16_t revision_id; 380 struct ras_test_mask test_mask; 381 }; 382 383 /* white list for inject test. 
*/ 384 #define RAS_BLOCK_MASK_ALL {\ 385 DEFAULT_RAS_BLOCK_MASK_INJECT,\ 386 DEFAULT_RAS_BLOCK_MASK_QUERY,\ 387 DEFAULT_RAS_BLOCK_MASK_BASIC\ 388 } 389 390 #define RAS_BLOCK_MASK_QUERY_BASIC {\ 391 0,\ 392 DEFAULT_RAS_BLOCK_MASK_QUERY,\ 393 DEFAULT_RAS_BLOCK_MASK_BASIC\ 394 } 395 396 static const struct ras_inject_test_config umc_ras_inject_test[] = { 397 {"ras_umc.1.0", "umc", 0, AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 398 }; 399 400 static const struct ras_inject_test_config gfx_ras_inject_test[] = { 401 {"ras_gfx.2.0", "gfx", AMDGPU_RAS_BLOCK__GFX_CPC_UCODE, 402 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 403 {"ras_gfx.2.1", "gfx", AMDGPU_RAS_BLOCK__GFX_CPF_TAG, 404 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 405 {"ras_gfx.2.2", "gfx", AMDGPU_RAS_BLOCK__GFX_CPG_TAG, 406 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 407 {"ras_gfx.2.3", "gfx", AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D, 408 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 409 {"ras_gfx.2.4", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO, 410 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 411 {"ras_gfx.2.5", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM, 412 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 413 {"ras_gfx.2.6", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM, 414 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 415 {"ras_gfx.2.7", "gfx", AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO, 416 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 417 {"ras_gfx.2.8", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA, 418 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 419 {"ras_gfx.2.9", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1, 420 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 421 {"ras_gfx.2.10", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0, 422 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 423 {"ras_gfx.2.11", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1, 424 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 425 {"ras_gfx.2.12", "gfx", AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM, 426 
AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 427 {"ras_gfx.2.13", "gfx", AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO, 428 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 429 {"ras_gfx.2.14", "gfx", AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM, 430 AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0}, 431 }; 432 433 static const struct ras_DID_test_mask ras_DID_array[] = { 434 {0x66a1, 0x00, RAS_BLOCK_MASK_ALL}, 435 {0x66a1, 0x01, RAS_BLOCK_MASK_ALL}, 436 {0x66a1, 0x04, RAS_BLOCK_MASK_ALL}, 437 }; 438 439 static uint32_t amdgpu_ras_find_block_id_by_name(const char *name) 440 { 441 int i; 442 443 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) { 444 if (strcmp(name, ras_block_string[i]) == 0) 445 return i; 446 } 447 448 return ARRAY_SIZE(ras_block_string); 449 } 450 451 static char *amdgpu_ras_get_error_type_id(enum amdgpu_ras_error_type type) 452 { 453 switch (type) { 454 case AMDGPU_RAS_ERROR__PARITY: 455 return "parity"; 456 case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE: 457 return "single_correctable"; 458 case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE: 459 return "multi_uncorrectable"; 460 case AMDGPU_RAS_ERROR__POISON: 461 return "poison"; 462 case AMDGPU_RAS_ERROR__NONE: 463 default: 464 return NULL; 465 } 466 } 467 468 static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device) 469 { 470 int i; 471 static struct ras_test_mask default_test_mask = RAS_BLOCK_MASK_QUERY_BASIC; 472 473 for (i = 0; i < sizeof(ras_DID_array) / sizeof(ras_DID_array[0]); i++) { 474 if (ras_DID_array[i].device_id == device->deviceinfo.pci->device_id && 475 ras_DID_array[i].revision_id == device->deviceinfo.pci->revision_id) 476 return ras_DID_array[i].test_mask; 477 } 478 return default_test_mask; 479 } 480 481 static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle) 482 { 483 union { 484 uint64_t feature_mask; 485 struct { 486 uint32_t enabled_features; 487 uint32_t supported_features; 488 }; 489 } features = { 0 }; 490 int ret; 491 492 ret = amdgpu_query_info(device_handle, 
AMDGPU_INFO_RAS_ENABLED_FEATURES, 493 sizeof(features), &features); 494 if (ret) 495 return 0; 496 497 return features.supported_features; 498 } 499 500 static int get_file_contents(char *file, char *buf, int size); 501 502 static int amdgpu_ras_lookup_id(drmDevicePtr device) 503 { 504 char path[PATH_SIZE]; 505 char str[128]; 506 drmPciBusInfo info; 507 int i; 508 int ret; 509 510 for (i = 0; i < MAX_CARDS_SUPPORTED; i++) { 511 memset(str, 0, sizeof(str)); 512 memset(&info, 0, sizeof(info)); 513 snprintf(path, PATH_SIZE, "/sys/kernel/debug/dri/%d/name", i); 514 if (get_file_contents(path, str, sizeof(str)) <= 0) 515 continue; 516 517 ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx", 518 &info.domain, &info.bus, &info.dev, &info.func); 519 if (ret != 4) 520 continue; 521 522 if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0) 523 return i; 524 } 525 return -1; 526 } 527 528 //helpers 529 530 static int test_card; 531 static char sysfs_path[PATH_SIZE]; 532 static char debugfs_path[PATH_SIZE]; 533 static uint32_t ras_mask; 534 static amdgpu_device_handle device_handle; 535 536 static void set_test_card(int card) 537 { 538 test_card = card; 539 snprintf(sysfs_path, PATH_SIZE, "/sys/class/drm/card%d/device/ras/", devices[card].id); 540 snprintf(debugfs_path, PATH_SIZE, "/sys/kernel/debug/dri/%d/ras/", devices[card].id); 541 ras_mask = devices[card].capability; 542 device_handle = devices[card].device_handle; 543 ras_block_mask_inject = devices[card].test_mask.inject_mask; 544 ras_block_mask_query = devices[card].test_mask.query_mask; 545 ras_block_mask_basic = devices[card].test_mask.basic_mask; 546 } 547 548 static const char *get_ras_sysfs_root(void) 549 { 550 return sysfs_path; 551 } 552 553 static const char *get_ras_debugfs_root(void) 554 { 555 return debugfs_path; 556 } 557 558 static int set_file_contents(char *file, char *buf, int size) 559 { 560 int n, fd; 561 fd = open(file, O_WRONLY); 562 if (fd == -1) 563 return -1; 564 n = write(fd, buf, 
size); 565 close(fd); 566 return n; 567 } 568 569 static int get_file_contents(char *file, char *buf, int size) 570 { 571 int n, fd; 572 fd = open(file, O_RDONLY); 573 if (fd == -1) 574 return -1; 575 n = read(fd, buf, size); 576 close(fd); 577 return n; 578 } 579 580 static int is_file_ok(char *file, int flags) 581 { 582 int fd; 583 584 fd = open(file, flags); 585 if (fd == -1) 586 return -1; 587 close(fd); 588 return 0; 589 } 590 591 static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block) 592 { 593 uint32_t feature_mask; 594 int ret; 595 596 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, 597 sizeof(feature_mask), &feature_mask); 598 if (ret) 599 return -1; 600 601 return (1 << block) & feature_mask; 602 } 603 604 static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block) 605 { 606 return (1 << block) & ras_mask; 607 } 608 609 static int amdgpu_ras_invoke(struct ras_debug_if *data) 610 { 611 char path[PATH_SIZE]; 612 int ret; 613 614 snprintf(path, sizeof(path), "%s", get_ras_debugfs_root()); 615 strncat(path, "ras_ctrl", sizeof(path) - strlen(path)); 616 617 ret = set_file_contents(path, (char *)data, sizeof(*data)) 618 - sizeof(*data); 619 return ret; 620 } 621 622 static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block, 623 unsigned long *ue, unsigned long *ce) 624 { 625 char buf[64]; 626 char name[PATH_SIZE]; 627 628 *ue = *ce = 0; 629 630 if (amdgpu_ras_is_feature_supported(block) <= 0) 631 return -1; 632 633 snprintf(name, sizeof(name), "%s", get_ras_sysfs_root()); 634 strncat(name, ras_block_str(block), sizeof(name) - strlen(name)); 635 strncat(name, "_err_count", sizeof(name) - strlen(name)); 636 637 if (is_file_ok(name, O_RDONLY)) 638 return 0; 639 640 if (get_file_contents(name, buf, sizeof(buf)) <= 0) 641 return -1; 642 643 if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2) 644 return -1; 645 646 return 0; 647 } 648 649 static int amdgpu_ras_inject(enum amdgpu_ras_block block, 650 uint32_t 
sub_block, enum amdgpu_ras_error_type type, 651 uint64_t address, uint64_t value) 652 { 653 struct ras_debug_if data = { .op = 2, }; 654 struct ras_inject_if *inject = &data.inject; 655 int ret; 656 657 if (amdgpu_ras_is_feature_enabled(block) <= 0) { 658 fprintf(stderr, "block id(%d) is not valid\n", block); 659 return -1; 660 } 661 662 inject->head.block = block; 663 inject->head.type = type; 664 inject->head.sub_block_index = sub_block; 665 strncpy(inject->head.name, ras_block_str(block), sizeof(inject->head.name)-1); 666 inject->address = address; 667 inject->value = value; 668 669 ret = amdgpu_ras_invoke(&data); 670 CU_ASSERT_EQUAL(ret, 0); 671 if (ret) 672 return -1; 673 674 return 0; 675 } 676 677 //tests 678 static void amdgpu_ras_features_test(int enable) 679 { 680 struct ras_debug_if data; 681 int ret; 682 int i; 683 684 data.op = enable; 685 for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) { 686 struct ras_common_if head = { 687 .block = i, 688 .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, 689 .sub_block_index = 0, 690 .name = "", 691 }; 692 693 if (amdgpu_ras_is_feature_supported(i) <= 0) 694 continue; 695 696 data.head = head; 697 698 ret = amdgpu_ras_invoke(&data); 699 CU_ASSERT_EQUAL(ret, 0); 700 701 if (ret) 702 continue; 703 704 ret = enable ^ amdgpu_ras_is_feature_enabled(i); 705 CU_ASSERT_EQUAL(ret, 0); 706 } 707 } 708 709 static void amdgpu_ras_disable_test(void) 710 { 711 int i; 712 for (i = 0; i < devices_count; i++) { 713 set_test_card(i); 714 amdgpu_ras_features_test(0); 715 } 716 } 717 718 static void amdgpu_ras_enable_test(void) 719 { 720 int i; 721 for (i = 0; i < devices_count; i++) { 722 set_test_card(i); 723 amdgpu_ras_features_test(1); 724 } 725 } 726 727 static void __amdgpu_ras_ip_inject_test(const struct ras_inject_test_config *ip_test, 728 uint32_t size) 729 { 730 int i, ret; 731 unsigned long old_ue, old_ce; 732 unsigned long ue, ce; 733 uint32_t block; 734 int timeout; 735 bool pass; 736 737 for (i = 0; i < size; i++) { 738 timeout 
= 3; 739 pass = false; 740 741 block = amdgpu_ras_find_block_id_by_name(ip_test[i].block); 742 743 /* Ensure one valid ip block */ 744 if (block == ARRAY_SIZE(ras_block_string)) 745 break; 746 747 /* Ensure RAS feature for the IP block is enabled by kernel */ 748 if (amdgpu_ras_is_feature_supported(block) <= 0) 749 break; 750 751 ret = amdgpu_ras_query_err_count(block, &old_ue, &old_ce); 752 CU_ASSERT_EQUAL(ret, 0); 753 if (ret) 754 break; 755 756 ret = amdgpu_ras_inject(block, 757 ip_test[i].sub_block, 758 ip_test[i].type, 759 ip_test[i].address, 760 ip_test[i].value); 761 CU_ASSERT_EQUAL(ret, 0); 762 if (ret) 763 break; 764 765 while (timeout > 0) { 766 sleep(5); 767 768 ret = amdgpu_ras_query_err_count(block, &ue, &ce); 769 CU_ASSERT_EQUAL(ret, 0); 770 if (ret) 771 break; 772 773 if (old_ue != ue || old_ce != ce) { 774 pass = true; 775 sleep(20); 776 break; 777 } 778 timeout -= 1; 779 } 780 printf("\t Test %s@block %s, subblock %d, error_type %s, address %ld, value %ld: %s\n", 781 ip_test[i].name, 782 ip_test[i].block, 783 ip_test[i].sub_block, 784 amdgpu_ras_get_error_type_id(ip_test[i].type), 785 ip_test[i].address, 786 ip_test[i].value, 787 pass ? 
"Pass" : "Fail"); 788 } 789 } 790 791 static void __amdgpu_ras_inject_test(void) 792 { 793 printf("...\n"); 794 795 /* run UMC ras inject test */ 796 __amdgpu_ras_ip_inject_test(umc_ras_inject_test, 797 ARRAY_SIZE(umc_ras_inject_test)); 798 799 /* run GFX ras inject test */ 800 __amdgpu_ras_ip_inject_test(gfx_ras_inject_test, 801 ARRAY_SIZE(gfx_ras_inject_test)); 802 } 803 804 static void amdgpu_ras_inject_test(void) 805 { 806 int i; 807 for (i = 0; i < devices_count; i++) { 808 set_test_card(i); 809 __amdgpu_ras_inject_test(); 810 } 811 } 812 813 static void __amdgpu_ras_query_test(void) 814 { 815 unsigned long ue, ce; 816 int ret; 817 int i; 818 819 for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) { 820 if (amdgpu_ras_is_feature_supported(i) <= 0) 821 continue; 822 823 if (!((1 << i) & ras_block_mask_query)) 824 continue; 825 826 ret = amdgpu_ras_query_err_count(i, &ue, &ce); 827 CU_ASSERT_EQUAL(ret, 0); 828 } 829 } 830 831 static void amdgpu_ras_query_test(void) 832 { 833 int i; 834 for (i = 0; i < devices_count; i++) { 835 set_test_card(i); 836 __amdgpu_ras_query_test(); 837 } 838 } 839 840 static void amdgpu_ras_basic_test(void) 841 { 842 int ret; 843 int i; 844 int j; 845 uint32_t features; 846 char path[PATH_SIZE]; 847 848 ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY); 849 CU_ASSERT_EQUAL(ret, 0); 850 851 for (i = 0; i < devices_count; i++) { 852 set_test_card(i); 853 854 ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, 855 sizeof(features), &features); 856 CU_ASSERT_EQUAL(ret, 0); 857 858 snprintf(path, sizeof(path), "%s", get_ras_debugfs_root()); 859 strncat(path, "ras_ctrl", sizeof(path) - strlen(path)); 860 861 ret = is_file_ok(path, O_WRONLY); 862 CU_ASSERT_EQUAL(ret, 0); 863 864 snprintf(path, sizeof(path), "%s", get_ras_sysfs_root()); 865 strncat(path, "features", sizeof(path) - strlen(path)); 866 867 ret = is_file_ok(path, O_RDONLY); 868 CU_ASSERT_EQUAL(ret, 0); 869 870 for (j = 0; j < 
AMDGPU_RAS_BLOCK__LAST; j++) { 871 ret = amdgpu_ras_is_feature_supported(j); 872 if (ret <= 0) 873 continue; 874 875 if (!((1 << j) & ras_block_mask_basic)) 876 continue; 877 878 snprintf(path, sizeof(path), "%s", get_ras_sysfs_root()); 879 strncat(path, ras_block_str(j), sizeof(path) - strlen(path)); 880 strncat(path, "_err_count", sizeof(path) - strlen(path)); 881 882 ret = is_file_ok(path, O_RDONLY); 883 CU_ASSERT_EQUAL(ret, 0); 884 885 snprintf(path, sizeof(path), "%s", get_ras_debugfs_root()); 886 strncat(path, ras_block_str(j), sizeof(path) - strlen(path)); 887 strncat(path, "_err_inject", sizeof(path) - strlen(path)); 888 889 ret = is_file_ok(path, O_WRONLY); 890 CU_ASSERT_EQUAL(ret, 0); 891 } 892 } 893 } 894 895 CU_TestInfo ras_tests[] = { 896 { "ras basic test", amdgpu_ras_basic_test }, 897 { "ras query test", amdgpu_ras_query_test }, 898 { "ras inject test", amdgpu_ras_inject_test }, 899 { "ras disable test", amdgpu_ras_disable_test }, 900 { "ras enable test", amdgpu_ras_enable_test }, 901 CU_TEST_INFO_NULL, 902 }; 903 904 CU_BOOL suite_ras_tests_enable(void) 905 { 906 amdgpu_device_handle device_handle; 907 uint32_t major_version; 908 uint32_t minor_version; 909 int i; 910 drmDevicePtr device; 911 912 for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) { 913 if (amdgpu_device_initialize(drm_amdgpu[i], &major_version, 914 &minor_version, &device_handle)) 915 continue; 916 917 if (drmGetDevice2(drm_amdgpu[i], 918 DRM_DEVICE_GET_PCI_REVISION, 919 &device)) 920 continue; 921 922 if (device->bustype == DRM_BUS_PCI && 923 amdgpu_ras_lookup_capability(device_handle)) { 924 amdgpu_device_deinitialize(device_handle); 925 return CU_TRUE; 926 } 927 928 if (amdgpu_device_deinitialize(device_handle)) 929 continue; 930 } 931 932 return CU_FALSE; 933 } 934 935 int suite_ras_tests_init(void) 936 { 937 drmDevicePtr device; 938 amdgpu_device_handle device_handle; 939 uint32_t major_version; 940 uint32_t minor_version; 941 uint32_t capability; 942 struct 
ras_test_mask test_mask; 943 int id; 944 int i; 945 int r; 946 947 for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) { 948 r = amdgpu_device_initialize(drm_amdgpu[i], &major_version, 949 &minor_version, &device_handle); 950 if (r) 951 continue; 952 953 if (drmGetDevice2(drm_amdgpu[i], 954 DRM_DEVICE_GET_PCI_REVISION, 955 &device)) { 956 amdgpu_device_deinitialize(device_handle); 957 continue; 958 } 959 960 if (device->bustype != DRM_BUS_PCI) { 961 amdgpu_device_deinitialize(device_handle); 962 continue; 963 } 964 965 capability = amdgpu_ras_lookup_capability(device_handle); 966 if (capability == 0) { 967 amdgpu_device_deinitialize(device_handle); 968 continue; 969 970 } 971 972 id = amdgpu_ras_lookup_id(device); 973 if (id == -1) { 974 amdgpu_device_deinitialize(device_handle); 975 continue; 976 } 977 978 test_mask = amdgpu_ras_get_test_mask(device); 979 980 devices[devices_count++] = (struct amdgpu_ras_data) { 981 device_handle, id, capability, test_mask, 982 }; 983 } 984 985 if (devices_count == 0) 986 return CUE_SINIT_FAILED; 987 988 return CUE_SUCCESS; 989 } 990 991 int suite_ras_tests_clean(void) 992 { 993 int r; 994 int i; 995 int ret = CUE_SUCCESS; 996 997 for (i = 0; i < devices_count; i++) { 998 r = amdgpu_device_deinitialize(devices[i].device_handle); 999 if (r) 1000 ret = CUE_SCLEAN_FAILED; 1001 } 1002 return ret; 1003 } 1004