• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2019 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 
13 #include <linux/pci.h>
14 #include <linux/aer.h>
15 #include <linux/module.h>
16 
17 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
18 
19 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
20 
21 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
22 MODULE_DESCRIPTION(HL_DRIVER_DESC);
23 MODULE_LICENSE("GPL v2");
24 
25 static int hl_major;
26 static struct class *hl_class;
27 static DEFINE_IDR(hl_devs_idr);
28 static DEFINE_MUTEX(hl_devs_idr_lock);
29 
30 static int timeout_locked = 30;
31 static int reset_on_lockup = 1;
32 static int memory_scrub;
33 static ulong boot_error_status_mask = ULONG_MAX;
34 
35 module_param(timeout_locked, int, 0444);
36 MODULE_PARM_DESC(timeout_locked,
37 	"Device lockup timeout in seconds (0 = disabled, default 30s)");
38 
39 module_param(reset_on_lockup, int, 0444);
40 MODULE_PARM_DESC(reset_on_lockup,
41 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
42 
43 module_param(memory_scrub, int, 0444);
44 MODULE_PARM_DESC(memory_scrub,
45 	"Scrub device memory in various states (0 = no, 1 = yes, default no)");
46 
47 module_param(boot_error_status_mask, ulong, 0444);
48 MODULE_PARM_DESC(boot_error_status_mask,
49 	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
50 
51 #define PCI_VENDOR_ID_HABANALABS	0x1da3
52 
53 #define PCI_IDS_GOYA			0x0001
54 #define PCI_IDS_GAUDI			0x1000
55 #define PCI_IDS_GAUDI_SEC		0x1010
56 
57 static const struct pci_device_id ids[] = {
58 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
59 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
60 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
61 	{ 0, }
62 };
63 MODULE_DEVICE_TABLE(pci, ids);
64 
65 /*
66  * get_asic_type - translate device id to asic type
67  *
68  * @device: id of the PCI device
69  *
70  * Translate device id to asic type.
71  * In case of unidentified device, return -1
72  */
get_asic_type(u16 device)73 static enum hl_asic_type get_asic_type(u16 device)
74 {
75 	enum hl_asic_type asic_type;
76 
77 	switch (device) {
78 	case PCI_IDS_GOYA:
79 		asic_type = ASIC_GOYA;
80 		break;
81 	case PCI_IDS_GAUDI:
82 		asic_type = ASIC_GAUDI;
83 		break;
84 	case PCI_IDS_GAUDI_SEC:
85 		asic_type = ASIC_GAUDI_SEC;
86 		break;
87 	default:
88 		asic_type = ASIC_INVALID;
89 		break;
90 	}
91 
92 	return asic_type;
93 }
94 
is_asic_secured(enum hl_asic_type asic_type)95 static bool is_asic_secured(enum hl_asic_type asic_type)
96 {
97 	switch (asic_type) {
98 	case ASIC_GAUDI_SEC:
99 		return true;
100 	default:
101 		return false;
102 	}
103 }
104 
105 /*
106  * hl_device_open - open function for habanalabs device
107  *
108  * @inode: pointer to inode structure
109  * @filp: pointer to file structure
110  *
111  * Called when process opens an habanalabs device.
112  */
hl_device_open(struct inode * inode,struct file * filp)113 int hl_device_open(struct inode *inode, struct file *filp)
114 {
115 	enum hl_device_status status;
116 	struct hl_device *hdev;
117 	struct hl_fpriv *hpriv;
118 	int rc;
119 
120 	mutex_lock(&hl_devs_idr_lock);
121 	hdev = idr_find(&hl_devs_idr, iminor(inode));
122 	mutex_unlock(&hl_devs_idr_lock);
123 
124 	if (!hdev) {
125 		pr_err("Couldn't find device %d:%d\n",
126 			imajor(inode), iminor(inode));
127 		return -ENXIO;
128 	}
129 
130 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
131 	if (!hpriv)
132 		return -ENOMEM;
133 
134 	hpriv->hdev = hdev;
135 	filp->private_data = hpriv;
136 	hpriv->filp = filp;
137 	mutex_init(&hpriv->restore_phase_mutex);
138 	kref_init(&hpriv->refcount);
139 	nonseekable_open(inode, filp);
140 
141 	hl_cb_mgr_init(&hpriv->cb_mgr);
142 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
143 
144 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
145 
146 	mutex_lock(&hdev->fpriv_list_lock);
147 
148 	if (!hl_device_operational(hdev, &status)) {
149 		dev_err_ratelimited(hdev->dev,
150 			"Can't open %s because it is %s\n",
151 			dev_name(hdev->dev), hdev->status[status]);
152 		rc = -EPERM;
153 		goto out_err;
154 	}
155 
156 	if (hdev->in_debug) {
157 		dev_err_ratelimited(hdev->dev,
158 			"Can't open %s because it is being debugged by another user\n",
159 			dev_name(hdev->dev));
160 		rc = -EPERM;
161 		goto out_err;
162 	}
163 
164 	if (hdev->compute_ctx) {
165 		dev_dbg_ratelimited(hdev->dev,
166 			"Can't open %s because another user is working on it\n",
167 			dev_name(hdev->dev));
168 		rc = -EBUSY;
169 		goto out_err;
170 	}
171 
172 	rc = hl_ctx_create(hdev, hpriv);
173 	if (rc) {
174 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
175 		goto out_err;
176 	}
177 
178 	/* Device is IDLE at this point so it is legal to change PLLs.
179 	 * There is no need to check anything because if the PLL is
180 	 * already HIGH, the set function will return without doing
181 	 * anything
182 	 */
183 	hl_device_set_frequency(hdev, PLL_HIGH);
184 
185 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
186 	mutex_unlock(&hdev->fpriv_list_lock);
187 
188 	hl_debugfs_add_file(hpriv);
189 
190 	hdev->open_counter++;
191 	hdev->last_successful_open_jif = jiffies;
192 
193 	return 0;
194 
195 out_err:
196 	mutex_unlock(&hdev->fpriv_list_lock);
197 	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
198 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
199 	filp->private_data = NULL;
200 	mutex_destroy(&hpriv->restore_phase_mutex);
201 	put_pid(hpriv->taskpid);
202 
203 	kfree(hpriv);
204 
205 	return rc;
206 }
207 
hl_device_open_ctrl(struct inode * inode,struct file * filp)208 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
209 {
210 	struct hl_device *hdev;
211 	struct hl_fpriv *hpriv;
212 	int rc;
213 
214 	mutex_lock(&hl_devs_idr_lock);
215 	hdev = idr_find(&hl_devs_idr, iminor(inode));
216 	mutex_unlock(&hl_devs_idr_lock);
217 
218 	if (!hdev) {
219 		pr_err("Couldn't find device %d:%d\n",
220 			imajor(inode), iminor(inode));
221 		return -ENXIO;
222 	}
223 
224 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
225 	if (!hpriv)
226 		return -ENOMEM;
227 
228 	mutex_lock(&hdev->fpriv_list_lock);
229 
230 	if (!hl_device_operational(hdev, NULL)) {
231 		dev_err_ratelimited(hdev->dev_ctrl,
232 			"Can't open %s because it is disabled or in reset\n",
233 			dev_name(hdev->dev_ctrl));
234 		rc = -EPERM;
235 		goto out_err;
236 	}
237 
238 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
239 	mutex_unlock(&hdev->fpriv_list_lock);
240 
241 	hpriv->hdev = hdev;
242 	filp->private_data = hpriv;
243 	hpriv->filp = filp;
244 	hpriv->is_control = true;
245 	nonseekable_open(inode, filp);
246 
247 	hpriv->taskpid = find_get_pid(current->pid);
248 
249 	return 0;
250 
251 out_err:
252 	mutex_unlock(&hdev->fpriv_list_lock);
253 	kfree(hpriv);
254 	return rc;
255 }
256 
set_driver_behavior_per_device(struct hl_device * hdev)257 static void set_driver_behavior_per_device(struct hl_device *hdev)
258 {
259 	hdev->fw_components = FW_TYPE_ALL_TYPES;
260 	hdev->cpu_queues_enable = 1;
261 	hdev->heartbeat = 1;
262 	hdev->mmu_enable = 1;
263 	hdev->clock_gating_mask = ULONG_MAX;
264 	hdev->sram_scrambler_enable = 1;
265 	hdev->dram_scrambler_enable = 1;
266 	hdev->bmc_enable = 1;
267 	hdev->hard_reset_on_fw_events = 1;
268 	hdev->reset_on_preboot_fail = 1;
269 	hdev->reset_if_device_not_idle = 1;
270 
271 	hdev->reset_pcilink = 0;
272 	hdev->axi_drain = 0;
273 }
274 
275 /*
276  * create_hdev - create habanalabs device instance
277  *
278  * @dev: will hold the pointer to the new habanalabs device structure
279  * @pdev: pointer to the pci device
280  * @asic_type: in case of simulator device, which device is it
281  * @minor: in case of simulator device, the minor of the device
282  *
283  * Allocate memory for habanalabs device and initialize basic fields
284  * Identify the ASIC type
285  * Allocate ID (minor) for the device (only for real devices)
286  */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev,enum hl_asic_type asic_type,int minor)287 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
288 		enum hl_asic_type asic_type, int minor)
289 {
290 	struct hl_device *hdev;
291 	int rc, main_id, ctrl_id = 0;
292 
293 	*dev = NULL;
294 
295 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
296 	if (!hdev)
297 		return -ENOMEM;
298 
299 	/* First, we must find out which ASIC are we handling. This is needed
300 	 * to configure the behavior of the driver (kernel parameters)
301 	 */
302 	if (pdev) {
303 		hdev->asic_type = get_asic_type(pdev->device);
304 		if (hdev->asic_type == ASIC_INVALID) {
305 			dev_err(&pdev->dev, "Unsupported ASIC\n");
306 			rc = -ENODEV;
307 			goto free_hdev;
308 		}
309 	} else {
310 		hdev->asic_type = asic_type;
311 	}
312 
313 	if (pdev)
314 		hdev->asic_prop.fw_security_enabled =
315 					is_asic_secured(hdev->asic_type);
316 	else
317 		hdev->asic_prop.fw_security_enabled = false;
318 
319 	/* Assign status description string */
320 	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL],
321 					"operational", HL_STR_MAX);
322 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
323 					"in reset", HL_STR_MAX);
324 	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
325 					"disabled", HL_STR_MAX);
326 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
327 					"needs reset", HL_STR_MAX);
328 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
329 					"in device creation", HL_STR_MAX);
330 
331 	hdev->major = hl_major;
332 	hdev->reset_on_lockup = reset_on_lockup;
333 	hdev->memory_scrub = memory_scrub;
334 	hdev->boot_error_status_mask = boot_error_status_mask;
335 	hdev->stop_on_err = true;
336 
337 	hdev->pldm = 0;
338 
339 	set_driver_behavior_per_device(hdev);
340 
341 	hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
342 
343 	if (timeout_locked)
344 		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
345 	else
346 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
347 
348 	hdev->disabled = true;
349 	hdev->pdev = pdev; /* can be NULL in case of simulator device */
350 
351 	/* Set default DMA mask to 32 bits */
352 	hdev->dma_mask = 32;
353 
354 	mutex_lock(&hl_devs_idr_lock);
355 
356 	/* Always save 2 numbers, 1 for main device and 1 for control.
357 	 * They must be consecutive
358 	 */
359 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
360 				GFP_KERNEL);
361 
362 	if (main_id >= 0)
363 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
364 					main_id + 2, GFP_KERNEL);
365 
366 	mutex_unlock(&hl_devs_idr_lock);
367 
368 	if ((main_id < 0) || (ctrl_id < 0)) {
369 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
370 			pr_err("too many devices in the system\n");
371 
372 		if (main_id >= 0) {
373 			mutex_lock(&hl_devs_idr_lock);
374 			idr_remove(&hl_devs_idr, main_id);
375 			mutex_unlock(&hl_devs_idr_lock);
376 		}
377 
378 		rc = -EBUSY;
379 		goto free_hdev;
380 	}
381 
382 	hdev->id = main_id;
383 	hdev->id_control = ctrl_id;
384 
385 	*dev = hdev;
386 
387 	return 0;
388 
389 free_hdev:
390 	kfree(hdev);
391 	return rc;
392 }
393 
394 /*
395  * destroy_hdev - destroy habanalabs device instance
396  *
397  * @dev: pointer to the habanalabs device structure
398  *
399  */
destroy_hdev(struct hl_device * hdev)400 void destroy_hdev(struct hl_device *hdev)
401 {
402 	/* Remove device from the device list */
403 	mutex_lock(&hl_devs_idr_lock);
404 	idr_remove(&hl_devs_idr, hdev->id);
405 	idr_remove(&hl_devs_idr, hdev->id_control);
406 	mutex_unlock(&hl_devs_idr_lock);
407 
408 	kfree(hdev);
409 }
410 
/*
 * hl_pmops_suspend - power-management suspend callback
 *
 * @dev: pointer to the device being suspended
 *
 * Forwards to hl_device_suspend() for the habanalabs device stored as the
 * driver data of @dev. When no driver data is set, an error is logged and
 * 0 is returned.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
424 
/*
 * hl_pmops_resume - power-management resume callback
 *
 * @dev: pointer to the device being resumed
 *
 * Forwards to hl_device_resume() for the habanalabs device stored as the
 * driver data of @dev. When no driver data is set, an error is logged and
 * 0 is returned.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
438 
439 /*
440  * hl_pci_probe - probe PCI habanalabs devices
441  *
442  * @pdev: pointer to pci device
443  * @id: pointer to pci device id structure
444  *
445  * Standard PCI probe function for habanalabs device.
446  * Create a new habanalabs device and initialize it according to the
447  * device's type
448  */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)449 static int hl_pci_probe(struct pci_dev *pdev,
450 				const struct pci_device_id *id)
451 {
452 	struct hl_device *hdev;
453 	int rc;
454 
455 	dev_info(&pdev->dev, HL_NAME
456 		 " device found [%04x:%04x] (rev %x)\n",
457 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
458 
459 	rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
460 	if (rc)
461 		return rc;
462 
463 	pci_set_drvdata(pdev, hdev);
464 
465 	pci_enable_pcie_error_reporting(pdev);
466 
467 	rc = hl_device_init(hdev, hl_class);
468 	if (rc) {
469 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
470 		rc = -ENODEV;
471 		goto disable_device;
472 	}
473 
474 	return 0;
475 
476 disable_device:
477 	pci_disable_pcie_error_reporting(pdev);
478 	pci_set_drvdata(pdev, NULL);
479 	destroy_hdev(hdev);
480 
481 	return rc;
482 }
483 
484 /*
485  * hl_pci_remove - remove PCI habanalabs devices
486  *
487  * @pdev: pointer to pci device
488  *
489  * Standard PCI remove function for habanalabs device
490  */
hl_pci_remove(struct pci_dev * pdev)491 static void hl_pci_remove(struct pci_dev *pdev)
492 {
493 	struct hl_device *hdev;
494 
495 	hdev = pci_get_drvdata(pdev);
496 	if (!hdev)
497 		return;
498 
499 	hl_device_fini(hdev);
500 	pci_disable_pcie_error_reporting(pdev);
501 	pci_set_drvdata(pdev, NULL);
502 	destroy_hdev(hdev);
503 }
504 
505 /**
506  * hl_pci_err_detected - a PCI bus error detected on this device
507  *
508  * @pdev: pointer to pci device
509  * @state: PCI error type
510  *
511  * Called by the PCI subsystem whenever a non-correctable
512  * PCI bus error is detected
513  */
514 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)515 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
516 {
517 	struct hl_device *hdev = pci_get_drvdata(pdev);
518 	enum pci_ers_result result;
519 
520 	switch (state) {
521 	case pci_channel_io_normal:
522 		return PCI_ERS_RESULT_CAN_RECOVER;
523 
524 	case pci_channel_io_frozen:
525 		dev_warn(hdev->dev, "frozen state error detected\n");
526 		result = PCI_ERS_RESULT_NEED_RESET;
527 		break;
528 
529 	case pci_channel_io_perm_failure:
530 		dev_warn(hdev->dev, "failure state error detected\n");
531 		result = PCI_ERS_RESULT_DISCONNECT;
532 		break;
533 
534 	default:
535 		result = PCI_ERS_RESULT_NONE;
536 	}
537 
538 	hdev->asic_funcs->halt_engines(hdev, true, false);
539 
540 	return result;
541 }
542 
543 /**
544  * hl_pci_err_resume - resume after a PCI slot reset
545  *
546  * @pdev: pointer to pci device
547  *
548  */
hl_pci_err_resume(struct pci_dev * pdev)549 static void hl_pci_err_resume(struct pci_dev *pdev)
550 {
551 	struct hl_device *hdev = pci_get_drvdata(pdev);
552 
553 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
554 	hl_device_resume(hdev);
555 }
556 
557 /**
558  * hl_pci_err_slot_reset - a PCI slot reset has just happened
559  *
560  * @pdev: pointer to pci device
561  *
562  * Determine if the driver can recover from the PCI slot reset
563  */
hl_pci_err_slot_reset(struct pci_dev * pdev)564 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
565 {
566 	return PCI_ERS_RESULT_RECOVERED;
567 }
568 
569 static const struct dev_pm_ops hl_pm_ops = {
570 	.suspend = hl_pmops_suspend,
571 	.resume = hl_pmops_resume,
572 };
573 
574 static const struct pci_error_handlers hl_pci_err_handler = {
575 	.error_detected = hl_pci_err_detected,
576 	.slot_reset = hl_pci_err_slot_reset,
577 	.resume = hl_pci_err_resume,
578 };
579 
580 static struct pci_driver hl_pci_driver = {
581 	.name = HL_NAME,
582 	.id_table = ids,
583 	.probe = hl_pci_probe,
584 	.remove = hl_pci_remove,
585 	.shutdown = hl_pci_remove,
586 	.driver = {
587 		.name = HL_NAME,
588 		.pm = &hl_pm_ops,
589 		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
590 	},
591 	.err_handler = &hl_pci_err_handler,
592 };
593 
594 /*
595  * hl_init - Initialize the habanalabs kernel driver
596  */
hl_init(void)597 static int __init hl_init(void)
598 {
599 	int rc;
600 	dev_t dev;
601 
602 	pr_info("loading driver\n");
603 
604 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
605 	if (rc < 0) {
606 		pr_err("unable to get major\n");
607 		return rc;
608 	}
609 
610 	hl_major = MAJOR(dev);
611 
612 	hl_class = class_create(THIS_MODULE, HL_NAME);
613 	if (IS_ERR(hl_class)) {
614 		pr_err("failed to allocate class\n");
615 		rc = PTR_ERR(hl_class);
616 		goto remove_major;
617 	}
618 
619 	hl_debugfs_init();
620 
621 	rc = pci_register_driver(&hl_pci_driver);
622 	if (rc) {
623 		pr_err("failed to register pci device\n");
624 		goto remove_debugfs;
625 	}
626 
627 	pr_debug("driver loaded\n");
628 
629 	return 0;
630 
631 remove_debugfs:
632 	hl_debugfs_fini();
633 	class_destroy(hl_class);
634 remove_major:
635 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
636 	return rc;
637 }
638 
639 /*
640  * hl_exit - Release all resources of the habanalabs kernel driver
641  */
hl_exit(void)642 static void __exit hl_exit(void)
643 {
644 	pci_unregister_driver(&hl_pci_driver);
645 
646 	/*
647 	 * Removing debugfs must be after all devices or simulator devices
648 	 * have been removed because otherwise we get a bug in the
649 	 * debugfs module for referencing NULL objects
650 	 */
651 	hl_debugfs_fini();
652 
653 	class_destroy(hl_class);
654 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
655 
656 	idr_destroy(&hl_devs_idr);
657 
658 	pr_debug("driver removed\n");
659 }
660 
661 module_init(hl_init);
662 module_exit(hl_exit);
663