/*
 * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 *
 * Copyright 2014 IBM Corporation
 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
 */

#undef DEBUG

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/of.h>
#include <linux/mm.h>
#include <linux/slab.h>

#include <asm/opal.h>
#include <asm/cputable.h>
#include <asm/machdep.h>

static int opal_hmi_handler_nb_init;
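/*
 * One queued copy of an OPAL HMI event, taken in the message notifier
 * (atomic context) and processed later from the hmi_event_work workqueue.
 */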
struct OpalHmiEvtNode {
	struct list_head list;
	struct OpalHMIEvent hmi_evt;
};

struct xstop_reason {
	uint32_t xstop_reason;
	const char *unit_failed;
	const char *description;
};

static LIST_HEAD(opal_hmi_evt_list);
static DEFINE_SPINLOCK(opal_hmi_evt_lock);

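/*
 * Decode a core checkstop: xstop_reason is a bitmask, so print the failed
 * unit and description for every reason bit that is set.
 */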
static void print_core_checkstop_reason(const char *level,
					struct OpalHMIEvent *hmi_evt)
{
	int i;
	static const struct xstop_reason xstop_reason[] = {
		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
				"RegFile core check stop" },
		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
				"Core checkstop during recovery" },
		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
				"RegFile core check stop (mapper error)" },
		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
				"Recovery in maintenance mode" },
		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
				"RegFile core check stop" },
		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
				"Forward Progress Error" },
		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
				"Hypervisor Resource error - core check stop" },
		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
				"Hang Recovery Failed (core check stop)" },
		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
				"Ambiguous Hang Detected (unknown source)" },
		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
				"Debug Trigger Error inject" },
		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
				"Hypervisor check stop via SPRC/SPRD" },
	};

	/* Validity check */
	if (!hmi_evt->u.xstop_error.xstop_reason) {
		printk("%s Unknown Core check stop.\n", level);
		return;
	}

	printk("%s CPU PIR: %08x\n", level,
			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
					xstop_reason[i].xstop_reason)
			printk("%s [Unit: %-3s] %s\n", level,
					xstop_reason[i].unit_failed,
					xstop_reason[i].description);
}

static void print_nx_checkstop_reason(const char *level,
					struct OpalHMIEvent *hmi_evt)
{
	int i;
	static const struct xstop_reason xstop_reason[] = {
		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
				"SHM invalid state error" },
		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
				"DMA invalid state error bit 15" },
		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
				"DMA invalid state error bit 16" },
		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
				"Channel 0 invalid state error" },
		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
				"Channel 1 invalid state error" },
		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
				"Channel 2 invalid state error" },
		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
				"Channel 3 invalid state error" },
		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
				"Channel 4 invalid state error" },
		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
				"Channel 5 invalid state error" },
		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
				"Channel 6 invalid state error" },
		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
				"Channel 7 invalid state error" },
		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
				"UE error on CRB(CSB address, CCB)" },
		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
				"SUE error on CRB(CSB address, CCB)" },
		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
				"CRB Kill ISN received while holding ISN with UE error" },
	};

	/* Validity check */
	if (!hmi_evt->u.xstop_error.xstop_reason) {
		printk("%s Unknown NX check stop.\n", level);
		return;
	}

	printk("%s NX checkstop on CHIP ID: %x\n", level,
			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
					xstop_reason[i].xstop_reason)
			printk("%s [Unit: %-3s] %s\n", level,
					xstop_reason[i].unit_failed,
					xstop_reason[i].description);
}

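/* Dispatch on the checkstop type reported in the malfunction alert. */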
static void print_checkstop_reason(const char *level,
					struct OpalHMIEvent *hmi_evt)
{
	switch (hmi_evt->u.xstop_error.xstop_type) {
	case CHECKSTOP_TYPE_CORE:
		print_core_checkstop_reason(level, hmi_evt);
		break;
	case CHECKSTOP_TYPE_NX:
		print_nx_checkstop_reason(level, hmi_evt);
		break;
	case CHECKSTOP_TYPE_UNKNOWN:
		printk("%s Unknown Malfunction Alert.\n", level);
		break;
	}
}

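/*
 * Log one HMI event: map its severity to a printk level, print the event
 * type, HMER (plus TFMR for timer facility errors), and, for OpalHMIEvt_V2
 * or later, the checkstop reason behind a malfunction alert.
 */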
static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
{
	const char *level, *sevstr, *error_info;
	static const char *hmi_error_types[] = {
		"Malfunction Alert",
		"Processor Recovery done",
		"Processor recovery occurred again",
		"Processor recovery occurred for masked error",
		"Timer facility experienced an error",
		"TFMR SPR is corrupted",
		"UPS (Uninterrupted Power System) Overflow indication",
		"An XSCOM operation failure",
		"An XSCOM operation completed",
		"SCOM has set a reserved FIR bit to cause recovery",
		"Debug trigger has set a reserved FIR bit to cause recovery",
		"A hypervisor resource error occurred"
	};

	/* Print things out */
	if (hmi_evt->version < OpalHMIEvt_V1) {
		pr_err("HMI Interrupt, Unknown event version %d !\n",
			hmi_evt->version);
		return;
	}
	switch (hmi_evt->severity) {
	case OpalHMI_SEV_NO_ERROR:
		level = KERN_INFO;
		sevstr = "Harmless";
		break;
	case OpalHMI_SEV_WARNING:
		level = KERN_WARNING;
		sevstr = "";
		break;
	case OpalHMI_SEV_ERROR_SYNC:
		level = KERN_ERR;
		sevstr = "Severe";
		break;
	case OpalHMI_SEV_FATAL:
	default:
		level = KERN_ERR;
		sevstr = "Fatal";
		break;
	}

	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
		level, sevstr,
		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
		"Recovered" : "Not recovered");
	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
			hmi_error_types[hmi_evt->type]
			: "Unknown";
	printk("%s Error detail: %s\n", level, error_info);
	printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
		printk("%s TFMR: %016llx\n", level,
				be64_to_cpu(hmi_evt->tfmr));

	if (hmi_evt->version < OpalHMIEvt_V2)
		return;

	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
		print_checkstop_reason(level, hmi_evt);
}

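/*
 * Workqueue handler: drain the queued HMI events, log each one, and panic
 * (after asking OPAL for a platform-error reboot) if any event was not
 * recovered.
 */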
static void hmi_event_handler(struct work_struct *work)
{
	unsigned long flags;
	struct OpalHMIEvent *hmi_evt;
	struct OpalHmiEvtNode *msg_node;
	uint8_t disposition;
	struct opal_msg msg;
	int unrecoverable = 0;

	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
	while (!list_empty(&opal_hmi_evt_list)) {
		msg_node = list_entry(opal_hmi_evt_list.next,
					struct OpalHmiEvtNode, list);
		list_del(&msg_node->list);
		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);

		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
		print_hmi_event_info(hmi_evt);
		disposition = hmi_evt->disposition;
		kfree(msg_node);

		/*
		 * Check whether the HMI event has been recovered. If not,
		 * the kernel can't continue and we need to panic. But before
		 * doing that, display all the HMI events available on the
		 * list and set the unrecoverable flag to 1.
		 */
		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
			unrecoverable = 1;

		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
	}
	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);

	if (unrecoverable) {
		int ret;

		/* Pull all HMI events from OPAL before we panic. */
		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
			u32 type;

			type = be32_to_cpu(msg.msg_type);

			/* skip if not HMI event */
			if (type != OPAL_MSG_HMI_EVT)
				continue;

			/* HMI event info starts from param[0] */
			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
			print_hmi_event_info(hmi_evt);
		}

		/*
		 * Unrecoverable HMI exception. We need to inform BMC/OCC
		 * about this error so that it can collect relevant data
		 * for error analysis before rebooting.
		 */
		ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR,
			"Unrecoverable HMI exception");
		if (ret == OPAL_UNSUPPORTED) {
			pr_emerg("Reboot type %d not supported\n",
					OPAL_REBOOT_PLATFORM_ERROR);
		}

		/*
		 * Fall through and panic if opal_cec_reboot2() returns
		 * OPAL_UNSUPPORTED.
		 */
		panic("Unrecoverable HMI exception");
	}
}

static DECLARE_WORK(hmi_event_work, hmi_event_handler);
/*
 * opal_handle_hmi_event - notifier handler that queues up HMI events
 * to be processed later.
 */
static int opal_handle_hmi_event(struct notifier_block *nb,
				 unsigned long msg_type, void *msg)
{
	unsigned long flags;
	struct OpalHMIEvent *hmi_evt;
	struct opal_msg *hmi_msg = msg;
	struct OpalHmiEvtNode *msg_node;

	/* Sanity Checks */
	if (msg_type != OPAL_MSG_HMI_EVT)
		return 0;

	/* HMI event info starts from param[0] */
	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];

	/* Delay the logging of HMI events to workqueue. */
	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
	if (!msg_node) {
		pr_err("HMI: out of memory, Opal message event not handled\n");
		return -ENOMEM;
	}
	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent));

	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
	list_add(&msg_node->list, &opal_hmi_evt_list);
	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);

	schedule_work(&hmi_event_work);
	return 0;
}

static struct notifier_block opal_hmi_handler_nb = {
	.notifier_call	= opal_handle_hmi_event,
	.next		= NULL,
	.priority	= 0,
};

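/*
 * Register the HMI notifier with OPAL once; the opal_hmi_handler_nb_init
 * flag makes any later call a no-op.
 */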
int __init opal_hmi_handler_init(void)
{
	int ret;

	if (!opal_hmi_handler_nb_init) {
		ret = opal_message_notifier_register(
				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
		if (ret) {
			pr_err("%s: Can't register OPAL event notifier (%d)\n",
			       __func__, ret);
			return ret;
		}
		opal_hmi_handler_nb_init = 1;
	}
	return 0;
}