/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "umc_v8_7.h"
#include "amdgpu_ras.h"
#include "amdgpu.h"

#include "rsmu/rsmu_0_0_2_offset.h"
#include "rsmu/rsmu_0_0_2_sh_mask.h"
#include "umc/umc_8_7_0_offset.h"
#include "umc/umc_8_7_0_sh_mask.h"

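/* register offset stride between two UMC instances */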
#define UMC_8_INST_DIST			0x40000

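/*
 * Mapping from (UMC instance, channel instance) to the physical memory
 * channel index; used below when folding a channel address into a SoC
 * physical address and when recording bad pages to EEPROM.
 */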
const uint32_t
	umc_v8_7_channel_idx_tbl[UMC_V8_7_UMC_INSTANCE_NUM][UMC_V8_7_CHANNEL_INSTANCE_NUM] = {
		{2, 11},  {4, 13},
		{1, 8},   {7, 14},
		{10, 3},  {12, 5},
		{9, 0},   {15, 6}
};

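/* compute the register offset of a given channel within a given UMC instance */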
static inline uint32_t get_umc_8_reg_offset(struct amdgpu_device *adev,
					    uint32_t umc_inst,
					    uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_8_INST_DIST * umc_inst;
}

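/*
 * Reset the GECC error counters of both chips (chip selects) behind one
 * channel back to UMC_V8_7_CE_CNT_INIT, so that subsequent reads measure
 * only newly recorded errors.
 */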
static void umc_v8_7_clear_error_count_per_channel(struct amdgpu_device *adev,
					uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_addr;
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
					umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
					UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
			ecc_err_cnt_sel);

	/* clear higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
			UMC_V8_7_CE_CNT_INIT);
}

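/* walk every channel of every UMC instance and reset its error counters */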
static void umc_v8_7_clear_error_count(struct amdgpu_device *adev)
{
	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_8_reg_offset(adev,
						umc_inst,
						ch_inst);

		umc_v8_7_clear_error_count_per_channel(adev,
						umc_reg_offset);
	}
}

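/*
 * Accumulate correctable (CE) errors for one channel: read the GECC error
 * counter of each chip, subtract the initial counter value, and also count
 * an SRAM correctable error reported through the MCA status register.
 */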
static void umc_v8_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	/* UMC 8_7_2 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
		 UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip and check the error counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
		 UMC_V8_7_CE_CNT_INIT);

	/* check for SRAM correctable error:
	 * MCUMC_STATUS is a 64-bit register
	 */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

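/*
 * Count one uncorrectable (UE) error for this channel if the MCA status
 * register flags a valid deferred, uncorrected, poisoned, or transaction
 * error.
 */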
static void umc_v8_7_query_uncorrectable_error_count(struct amdgpu_device *adev,
						     uint32_t umc_reg_offset,
						     unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);

	/* check the MCUMC_STATUS */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

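/* RAS callback: tally CE/UE errors across all channels, then reset the counters */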
static void umc_v8_7_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_8_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v8_7_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v8_7_query_uncorrectable_error_count(adev,
							 umc_reg_offset,
							 &(err_data->ue_count));
	}

	umc_v8_7_clear_error_count(adev);
}

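/*
 * If this channel logged a UE, reconstruct the failing address: mask off the
 * LSBs the hardware marks as invalid, fold in the channel index to form a
 * SoC physical address, and record the page for retirement in EEPROM.
 */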
static void umc_v8_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return;

	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];

	/* calculate error address if ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);

		/* translate umc channel address to soc pa, 3 parts are included */
		retired_page = ADDR_OF_4KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* we only save ue error information currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

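/* RAS callback: harvest error addresses from every channel of every UMC */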
static void umc_v8_7_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_8_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v8_7_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}
}

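/*
 * Arm error counting on one channel: point the counter at each chip in turn,
 * route CE errors to an APIC-based interrupt, and preload the counters with
 * UMC_V8_7_CE_CNT_INIT.
 */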
static void umc_v8_7_err_cnt_init_per_channel(struct amdgpu_device *adev,
					      uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt_addr;

	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 0);
	/* set ce error interrupt type to APIC based interrupt */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrInt, 0x1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	/* set error count to initial value */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);

	/* select the higher chip and check the error counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
					GeccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
}

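/* RAS init callback: arm the error counters on all channels */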
static void umc_v8_7_err_cnt_init(struct amdgpu_device *adev)
{
	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_8_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v8_7_err_cnt_init_per_channel(adev, umc_reg_offset);
	}
}

const struct amdgpu_umc_funcs umc_v8_7_funcs = {
	.err_cnt_init = umc_v8_7_err_cnt_init,
	.ras_late_init = amdgpu_umc_ras_late_init,
	.query_ras_error_count = umc_v8_7_query_ras_error_count,
	.query_ras_error_address = umc_v8_7_query_ras_error_address,
};
332