1 /*
 * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; If not, see <http://www.gnu.org/licenses/>.
16 *
17 * Copyright 2014 IBM Corporation
18 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
19 */
20
21 #undef DEBUG
22
23 #include <linux/kernel.h>
24 #include <linux/init.h>
25 #include <linux/of.h>
26 #include <linux/mm.h>
27 #include <linux/slab.h>
28
29 #include <asm/opal.h>
30 #include <asm/cputable.h>
31 #include <asm/machdep.h>
32
33 #include "powernv.h"
34
/*
 * Set to 1 by opal_hmi_handler_init() once the HMI notifier has been
 * registered, so that later calls skip the registration.
 */
static int opal_hmi_handler_nb_init;
/* One queued HMI event waiting to be logged by the work handler. */
struct OpalHmiEvtNode {
	struct list_head list;		/* link in opal_hmi_evt_list */
	struct OpalHMIEvent hmi_evt;	/* copy of the event from the OPAL message */
};
40
/* Table entry used to decode one checkstop reason bit into text. */
struct xstop_reason {
	uint32_t xstop_reason;		/* mask ANDed with the event's reason word */
	const char *unit_failed;	/* unit name printed in the "[Unit: ...]" field */
	const char *description;	/* text printed after the unit name */
};
46
/* HMI events queued by the notifier and drained by hmi_event_handler(). */
static LIST_HEAD(opal_hmi_evt_list);
/* Held around every add to / removal from opal_hmi_evt_list. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
49
print_core_checkstop_reason(const char * level,struct OpalHMIEvent * hmi_evt)50 static void print_core_checkstop_reason(const char *level,
51 struct OpalHMIEvent *hmi_evt)
52 {
53 int i;
54 static const struct xstop_reason xstop_reason[] = {
55 { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
56 "RegFile core check stop" },
57 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
58 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
59 "Core checkstop during recovery" },
60 { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
61 "RegFile core check stop (mapper error)" },
62 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
63 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
64 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
65 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
66 "Recovery in maintenance mode" },
67 { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
68 "RegFile core check stop" },
69 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
70 "Forward Progress Error" },
71 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
72 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
73 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
74 "Hypervisor Resource error - core check stop" },
75 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
76 "Hang Recovery Failed (core check stop)" },
77 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
78 "Ambiguous Hang Detected (unknown source)" },
79 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
80 "Debug Trigger Error inject" },
81 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
82 "Hypervisor check stop via SPRC/SPRD" },
83 };
84
85 /* Validity check */
86 if (!hmi_evt->u.xstop_error.xstop_reason) {
87 printk("%s Unknown Core check stop.\n", level);
88 return;
89 }
90
91 printk("%s CPU PIR: %08x\n", level,
92 be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
93 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
94 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
95 xstop_reason[i].xstop_reason)
96 printk("%s [Unit: %-3s] %s\n", level,
97 xstop_reason[i].unit_failed,
98 xstop_reason[i].description);
99 }
100
print_nx_checkstop_reason(const char * level,struct OpalHMIEvent * hmi_evt)101 static void print_nx_checkstop_reason(const char *level,
102 struct OpalHMIEvent *hmi_evt)
103 {
104 int i;
105 static const struct xstop_reason xstop_reason[] = {
106 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
107 "SHM invalid state error" },
108 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
109 "DMA invalid state error bit 15" },
110 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
111 "DMA invalid state error bit 16" },
112 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
113 "Channel 0 invalid state error" },
114 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
115 "Channel 1 invalid state error" },
116 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
117 "Channel 2 invalid state error" },
118 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
119 "Channel 3 invalid state error" },
120 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
121 "Channel 4 invalid state error" },
122 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
123 "Channel 5 invalid state error" },
124 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
125 "Channel 6 invalid state error" },
126 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
127 "Channel 7 invalid state error" },
128 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
129 "UE error on CRB(CSB address, CCB)" },
130 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
131 "SUE error on CRB(CSB address, CCB)" },
132 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
133 "CRB Kill ISN received while holding ISN with UE error" },
134 };
135
136 /* Validity check */
137 if (!hmi_evt->u.xstop_error.xstop_reason) {
138 printk("%s Unknown NX check stop.\n", level);
139 return;
140 }
141
142 printk("%s NX checkstop on CHIP ID: %x\n", level,
143 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
144 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
145 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
146 xstop_reason[i].xstop_reason)
147 printk("%s [Unit: %-3s] %s\n", level,
148 xstop_reason[i].unit_failed,
149 xstop_reason[i].description);
150 }
151
print_checkstop_reason(const char * level,struct OpalHMIEvent * hmi_evt)152 static void print_checkstop_reason(const char *level,
153 struct OpalHMIEvent *hmi_evt)
154 {
155 uint8_t type = hmi_evt->u.xstop_error.xstop_type;
156 switch (type) {
157 case CHECKSTOP_TYPE_CORE:
158 print_core_checkstop_reason(level, hmi_evt);
159 break;
160 case CHECKSTOP_TYPE_NX:
161 print_nx_checkstop_reason(level, hmi_evt);
162 break;
163 default:
164 printk("%s Unknown Malfunction Alert of type %d\n",
165 level, type);
166 break;
167 }
168 }
169
print_hmi_event_info(struct OpalHMIEvent * hmi_evt)170 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
171 {
172 const char *level, *sevstr, *error_info;
173 static const char *hmi_error_types[] = {
174 "Malfunction Alert",
175 "Processor Recovery done",
176 "Processor recovery occurred again",
177 "Processor recovery occurred for masked error",
178 "Timer facility experienced an error",
179 "TFMR SPR is corrupted",
180 "UPS (Uniterrupted Power System) Overflow indication",
181 "An XSCOM operation failure",
182 "An XSCOM operation completed",
183 "SCOM has set a reserved FIR bit to cause recovery",
184 "Debug trigger has set a reserved FIR bit to cause recovery",
185 "A hypervisor resource error occurred",
186 "CAPP recovery process is in progress",
187 };
188
189 /* Print things out */
190 if (hmi_evt->version < OpalHMIEvt_V1) {
191 pr_err("HMI Interrupt, Unknown event version %d !\n",
192 hmi_evt->version);
193 return;
194 }
195 switch (hmi_evt->severity) {
196 case OpalHMI_SEV_NO_ERROR:
197 level = KERN_INFO;
198 sevstr = "Harmless";
199 break;
200 case OpalHMI_SEV_WARNING:
201 level = KERN_WARNING;
202 sevstr = "";
203 break;
204 case OpalHMI_SEV_ERROR_SYNC:
205 level = KERN_ERR;
206 sevstr = "Severe";
207 break;
208 case OpalHMI_SEV_FATAL:
209 default:
210 level = KERN_ERR;
211 sevstr = "Fatal";
212 break;
213 }
214
215 printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
216 level, sevstr,
217 hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
218 "Recovered" : "Not recovered");
219 error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
220 hmi_error_types[hmi_evt->type]
221 : "Unknown";
222 printk("%s Error detail: %s\n", level, error_info);
223 printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
224 if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
225 (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
226 printk("%s TFMR: %016llx\n", level,
227 be64_to_cpu(hmi_evt->tfmr));
228
229 if (hmi_evt->version < OpalHMIEvt_V2)
230 return;
231
232 /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
233 if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
234 print_checkstop_reason(level, hmi_evt);
235 }
236
hmi_event_handler(struct work_struct * work)237 static void hmi_event_handler(struct work_struct *work)
238 {
239 unsigned long flags;
240 struct OpalHMIEvent *hmi_evt;
241 struct OpalHmiEvtNode *msg_node;
242 uint8_t disposition;
243 struct opal_msg msg;
244 int unrecoverable = 0;
245
246 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
247 while (!list_empty(&opal_hmi_evt_list)) {
248 msg_node = list_entry(opal_hmi_evt_list.next,
249 struct OpalHmiEvtNode, list);
250 list_del(&msg_node->list);
251 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
252
253 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
254 print_hmi_event_info(hmi_evt);
255 disposition = hmi_evt->disposition;
256 kfree(msg_node);
257
258 /*
259 * Check if HMI event has been recovered or not. If not
260 * then kernel can't continue, we need to panic.
261 * But before we do that, display all the HMI event
262 * available on the list and set unrecoverable flag to 1.
263 */
264 if (disposition != OpalHMI_DISPOSITION_RECOVERED)
265 unrecoverable = 1;
266
267 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
268 }
269 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
270
271 if (unrecoverable) {
272 /* Pull all HMI events from OPAL before we panic. */
273 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
274 u32 type;
275
276 type = be32_to_cpu(msg.msg_type);
277
278 /* skip if not HMI event */
279 if (type != OPAL_MSG_HMI_EVT)
280 continue;
281
282 /* HMI event info starts from param[0] */
283 hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
284 print_hmi_event_info(hmi_evt);
285 }
286
287 pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
288 }
289 }
290
/* Work item that runs hmi_event_handler(); scheduled by opal_handle_hmi_event(). */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
292 /*
293 * opal_handle_hmi_event - notifier handler that queues up HMI events
294 * to be preocessed later.
295 */
opal_handle_hmi_event(struct notifier_block * nb,unsigned long msg_type,void * msg)296 static int opal_handle_hmi_event(struct notifier_block *nb,
297 unsigned long msg_type, void *msg)
298 {
299 unsigned long flags;
300 struct OpalHMIEvent *hmi_evt;
301 struct opal_msg *hmi_msg = msg;
302 struct OpalHmiEvtNode *msg_node;
303
304 /* Sanity Checks */
305 if (msg_type != OPAL_MSG_HMI_EVT)
306 return 0;
307
308 /* HMI event info starts from param[0] */
309 hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
310
311 /* Delay the logging of HMI events to workqueue. */
312 msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
313 if (!msg_node) {
314 pr_err("HMI: out of memory, Opal message event not handled\n");
315 return -ENOMEM;
316 }
317 memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent));
318
319 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
320 list_add(&msg_node->list, &opal_hmi_evt_list);
321 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
322
323 schedule_work(&hmi_event_work);
324 return 0;
325 }
326
327 static struct notifier_block opal_hmi_handler_nb = {
328 .notifier_call = opal_handle_hmi_event,
329 .next = NULL,
330 .priority = 0,
331 };
332
opal_hmi_handler_init(void)333 int __init opal_hmi_handler_init(void)
334 {
335 int ret;
336
337 if (!opal_hmi_handler_nb_init) {
338 ret = opal_message_notifier_register(
339 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
340 if (ret) {
341 pr_err("%s: Can't register OPAL event notifier (%d)\n",
342 __func__, ret);
343 return ret;
344 }
345 opal_hmi_handler_nb_init = 1;
346 }
347 return 0;
348 }
349