• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2019 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 
13 #include <linux/pci.h>
14 #include <linux/module.h>
15 
16 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
17 
18 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
19 
20 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
21 MODULE_DESCRIPTION(HL_DRIVER_DESC);
22 MODULE_LICENSE("GPL v2");
23 
24 static int hl_major;
25 static struct class *hl_class;
26 static DEFINE_IDR(hl_devs_idr);
27 static DEFINE_MUTEX(hl_devs_idr_lock);
28 
29 static int timeout_locked = 5;
30 static int reset_on_lockup = 1;
31 
32 module_param(timeout_locked, int, 0444);
33 MODULE_PARM_DESC(timeout_locked,
34 	"Device lockup timeout in seconds (0 = disabled, default 5s)");
35 
36 module_param(reset_on_lockup, int, 0444);
37 MODULE_PARM_DESC(reset_on_lockup,
38 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
39 
40 #define PCI_VENDOR_ID_HABANALABS	0x1da3
41 
42 #define PCI_IDS_GOYA			0x0001
43 
44 static const struct pci_device_id ids[] = {
45 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
46 	{ 0, }
47 };
48 MODULE_DEVICE_TABLE(pci, ids);
49 
50 /*
51  * get_asic_type - translate device id to asic type
52  *
53  * @device: id of the PCI device
54  *
55  * Translate device id to asic type.
56  * In case of unidentified device, return -1
57  */
get_asic_type(u16 device)58 static enum hl_asic_type get_asic_type(u16 device)
59 {
60 	enum hl_asic_type asic_type;
61 
62 	switch (device) {
63 	case PCI_IDS_GOYA:
64 		asic_type = ASIC_GOYA;
65 		break;
66 	default:
67 		asic_type = ASIC_INVALID;
68 		break;
69 	}
70 
71 	return asic_type;
72 }
73 
74 /*
75  * hl_device_open - open function for habanalabs device
76  *
77  * @inode: pointer to inode structure
78  * @filp: pointer to file structure
79  *
80  * Called when process opens an habanalabs device.
81  */
hl_device_open(struct inode * inode,struct file * filp)82 int hl_device_open(struct inode *inode, struct file *filp)
83 {
84 	struct hl_device *hdev;
85 	struct hl_fpriv *hpriv;
86 	int rc;
87 
88 	mutex_lock(&hl_devs_idr_lock);
89 	hdev = idr_find(&hl_devs_idr, iminor(inode));
90 	mutex_unlock(&hl_devs_idr_lock);
91 
92 	if (!hdev) {
93 		pr_err("Couldn't find device %d:%d\n",
94 			imajor(inode), iminor(inode));
95 		return -ENXIO;
96 	}
97 
98 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
99 	if (!hpriv)
100 		return -ENOMEM;
101 
102 	hpriv->hdev = hdev;
103 	filp->private_data = hpriv;
104 	hpriv->filp = filp;
105 	mutex_init(&hpriv->restore_phase_mutex);
106 	kref_init(&hpriv->refcount);
107 	nonseekable_open(inode, filp);
108 
109 	hl_cb_mgr_init(&hpriv->cb_mgr);
110 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
111 
112 	hpriv->taskpid = find_get_pid(current->pid);
113 
114 	mutex_lock(&hdev->fpriv_list_lock);
115 
116 	if (hl_device_disabled_or_in_reset(hdev)) {
117 		dev_err_ratelimited(hdev->dev,
118 			"Can't open %s because it is disabled or in reset\n",
119 			dev_name(hdev->dev));
120 		rc = -EPERM;
121 		goto out_err;
122 	}
123 
124 	if (hdev->in_debug) {
125 		dev_err_ratelimited(hdev->dev,
126 			"Can't open %s because it is being debugged by another user\n",
127 			dev_name(hdev->dev));
128 		rc = -EPERM;
129 		goto out_err;
130 	}
131 
132 	if (hdev->compute_ctx) {
133 		dev_dbg_ratelimited(hdev->dev,
134 			"Can't open %s because another user is working on it\n",
135 			dev_name(hdev->dev));
136 		rc = -EBUSY;
137 		goto out_err;
138 	}
139 
140 	rc = hl_ctx_create(hdev, hpriv);
141 	if (rc) {
142 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
143 		goto out_err;
144 	}
145 
146 	/* Device is IDLE at this point so it is legal to change PLLs.
147 	 * There is no need to check anything because if the PLL is
148 	 * already HIGH, the set function will return without doing
149 	 * anything
150 	 */
151 	hl_device_set_frequency(hdev, PLL_HIGH);
152 
153 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
154 	mutex_unlock(&hdev->fpriv_list_lock);
155 
156 	hl_debugfs_add_file(hpriv);
157 
158 	return 0;
159 
160 out_err:
161 	mutex_unlock(&hdev->fpriv_list_lock);
162 
163 	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
164 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
165 	filp->private_data = NULL;
166 	mutex_destroy(&hpriv->restore_phase_mutex);
167 	put_pid(hpriv->taskpid);
168 
169 	kfree(hpriv);
170 	return rc;
171 }
172 
hl_device_open_ctrl(struct inode * inode,struct file * filp)173 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
174 {
175 	struct hl_device *hdev;
176 	struct hl_fpriv *hpriv;
177 	int rc;
178 
179 	mutex_lock(&hl_devs_idr_lock);
180 	hdev = idr_find(&hl_devs_idr, iminor(inode));
181 	mutex_unlock(&hl_devs_idr_lock);
182 
183 	if (!hdev) {
184 		pr_err("Couldn't find device %d:%d\n",
185 			imajor(inode), iminor(inode));
186 		return -ENXIO;
187 	}
188 
189 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
190 	if (!hpriv)
191 		return -ENOMEM;
192 
193 	mutex_lock(&hdev->fpriv_list_lock);
194 
195 	if (hl_device_disabled_or_in_reset(hdev)) {
196 		dev_err_ratelimited(hdev->dev_ctrl,
197 			"Can't open %s because it is disabled or in reset\n",
198 			dev_name(hdev->dev_ctrl));
199 		rc = -EPERM;
200 		goto out_err;
201 	}
202 
203 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
204 	mutex_unlock(&hdev->fpriv_list_lock);
205 
206 	hpriv->hdev = hdev;
207 	filp->private_data = hpriv;
208 	hpriv->filp = filp;
209 	hpriv->is_control = true;
210 	nonseekable_open(inode, filp);
211 
212 	hpriv->taskpid = find_get_pid(current->pid);
213 
214 	return 0;
215 
216 out_err:
217 	mutex_unlock(&hdev->fpriv_list_lock);
218 	kfree(hpriv);
219 	return rc;
220 }
221 
set_driver_behavior_per_device(struct hl_device * hdev)222 static void set_driver_behavior_per_device(struct hl_device *hdev)
223 {
224 	hdev->mmu_enable = 1;
225 	hdev->cpu_enable = 1;
226 	hdev->fw_loading = 1;
227 	hdev->cpu_queues_enable = 1;
228 	hdev->heartbeat = 1;
229 
230 	hdev->reset_pcilink = 0;
231 }
232 
233 /*
234  * create_hdev - create habanalabs device instance
235  *
236  * @dev: will hold the pointer to the new habanalabs device structure
237  * @pdev: pointer to the pci device
238  * @asic_type: in case of simulator device, which device is it
239  * @minor: in case of simulator device, the minor of the device
240  *
241  * Allocate memory for habanalabs device and initialize basic fields
242  * Identify the ASIC type
243  * Allocate ID (minor) for the device (only for real devices)
244  */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev,enum hl_asic_type asic_type,int minor)245 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
246 		enum hl_asic_type asic_type, int minor)
247 {
248 	struct hl_device *hdev;
249 	int rc, main_id, ctrl_id = 0;
250 
251 	*dev = NULL;
252 
253 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
254 	if (!hdev)
255 		return -ENOMEM;
256 
257 	/* First, we must find out which ASIC are we handling. This is needed
258 	 * to configure the behavior of the driver (kernel parameters)
259 	 */
260 	if (pdev) {
261 		hdev->asic_type = get_asic_type(pdev->device);
262 		if (hdev->asic_type == ASIC_INVALID) {
263 			dev_err(&pdev->dev, "Unsupported ASIC\n");
264 			rc = -ENODEV;
265 			goto free_hdev;
266 		}
267 	} else {
268 		hdev->asic_type = asic_type;
269 	}
270 
271 	hdev->major = hl_major;
272 	hdev->reset_on_lockup = reset_on_lockup;
273 	hdev->pldm = 0;
274 
275 	set_driver_behavior_per_device(hdev);
276 
277 	if (timeout_locked)
278 		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
279 	else
280 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
281 
282 	hdev->disabled = true;
283 	hdev->pdev = pdev; /* can be NULL in case of simulator device */
284 
285 	/* Set default DMA mask to 32 bits */
286 	hdev->dma_mask = 32;
287 
288 	mutex_lock(&hl_devs_idr_lock);
289 
290 	/* Always save 2 numbers, 1 for main device and 1 for control.
291 	 * They must be consecutive
292 	 */
293 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
294 				GFP_KERNEL);
295 
296 	if (main_id >= 0)
297 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
298 					main_id + 2, GFP_KERNEL);
299 
300 	mutex_unlock(&hl_devs_idr_lock);
301 
302 	if ((main_id < 0) || (ctrl_id < 0)) {
303 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
304 			pr_err("too many devices in the system\n");
305 
306 		if (main_id >= 0) {
307 			mutex_lock(&hl_devs_idr_lock);
308 			idr_remove(&hl_devs_idr, main_id);
309 			mutex_unlock(&hl_devs_idr_lock);
310 		}
311 
312 		rc = -EBUSY;
313 		goto free_hdev;
314 	}
315 
316 	hdev->id = main_id;
317 	hdev->id_control = ctrl_id;
318 
319 	*dev = hdev;
320 
321 	return 0;
322 
323 free_hdev:
324 	kfree(hdev);
325 	return rc;
326 }
327 
328 /*
329  * destroy_hdev - destroy habanalabs device instance
330  *
331  * @dev: pointer to the habanalabs device structure
332  *
333  */
destroy_hdev(struct hl_device * hdev)334 void destroy_hdev(struct hl_device *hdev)
335 {
336 	/* Remove device from the device list */
337 	mutex_lock(&hl_devs_idr_lock);
338 	idr_remove(&hl_devs_idr, hdev->id);
339 	idr_remove(&hl_devs_idr, hdev->id_control);
340 	mutex_unlock(&hl_devs_idr_lock);
341 
342 	kfree(hdev);
343 }
344 
/*
 * hl_pmops_suspend - power-management suspend callback
 *
 * @dev: device whose driver data holds the habanalabs device structure
 *
 * Returns the result of hl_device_suspend(), or 0 when no driver data is
 * attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
358 
/*
 * hl_pmops_resume - power-management resume callback
 *
 * @dev: device whose driver data holds the habanalabs device structure
 *
 * Returns the result of hl_device_resume(), or 0 when no driver data is
 * attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
372 
373 /*
374  * hl_pci_probe - probe PCI habanalabs devices
375  *
376  * @pdev: pointer to pci device
377  * @id: pointer to pci device id structure
378  *
379  * Standard PCI probe function for habanalabs device.
380  * Create a new habanalabs device and initialize it according to the
381  * device's type
382  */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)383 static int hl_pci_probe(struct pci_dev *pdev,
384 				const struct pci_device_id *id)
385 {
386 	struct hl_device *hdev;
387 	int rc;
388 
389 	dev_info(&pdev->dev, HL_NAME
390 		 " device found [%04x:%04x] (rev %x)\n",
391 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
392 
393 	rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
394 	if (rc)
395 		return rc;
396 
397 	pci_set_drvdata(pdev, hdev);
398 
399 	rc = hl_device_init(hdev, hl_class);
400 	if (rc) {
401 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
402 		rc = -ENODEV;
403 		goto disable_device;
404 	}
405 
406 	return 0;
407 
408 disable_device:
409 	pci_set_drvdata(pdev, NULL);
410 	destroy_hdev(hdev);
411 
412 	return rc;
413 }
414 
415 /*
416  * hl_pci_remove - remove PCI habanalabs devices
417  *
418  * @pdev: pointer to pci device
419  *
420  * Standard PCI remove function for habanalabs device
421  */
hl_pci_remove(struct pci_dev * pdev)422 static void hl_pci_remove(struct pci_dev *pdev)
423 {
424 	struct hl_device *hdev;
425 
426 	hdev = pci_get_drvdata(pdev);
427 	if (!hdev)
428 		return;
429 
430 	hl_device_fini(hdev);
431 	pci_set_drvdata(pdev, NULL);
432 
433 	destroy_hdev(hdev);
434 }
435 
436 static const struct dev_pm_ops hl_pm_ops = {
437 	.suspend = hl_pmops_suspend,
438 	.resume = hl_pmops_resume,
439 };
440 
441 static struct pci_driver hl_pci_driver = {
442 	.name = HL_NAME,
443 	.id_table = ids,
444 	.probe = hl_pci_probe,
445 	.remove = hl_pci_remove,
446 	.driver.pm = &hl_pm_ops,
447 };
448 
449 /*
450  * hl_init - Initialize the habanalabs kernel driver
451  */
hl_init(void)452 static int __init hl_init(void)
453 {
454 	int rc;
455 	dev_t dev;
456 
457 	pr_info("loading driver\n");
458 
459 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
460 	if (rc < 0) {
461 		pr_err("unable to get major\n");
462 		return rc;
463 	}
464 
465 	hl_major = MAJOR(dev);
466 
467 	hl_class = class_create(THIS_MODULE, HL_NAME);
468 	if (IS_ERR(hl_class)) {
469 		pr_err("failed to allocate class\n");
470 		rc = PTR_ERR(hl_class);
471 		goto remove_major;
472 	}
473 
474 	hl_debugfs_init();
475 
476 	rc = pci_register_driver(&hl_pci_driver);
477 	if (rc) {
478 		pr_err("failed to register pci device\n");
479 		goto remove_debugfs;
480 	}
481 
482 	pr_debug("driver loaded\n");
483 
484 	return 0;
485 
486 remove_debugfs:
487 	hl_debugfs_fini();
488 	class_destroy(hl_class);
489 remove_major:
490 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
491 	return rc;
492 }
493 
494 /*
495  * hl_exit - Release all resources of the habanalabs kernel driver
496  */
hl_exit(void)497 static void __exit hl_exit(void)
498 {
499 	pci_unregister_driver(&hl_pci_driver);
500 
501 	/*
502 	 * Removing debugfs must be after all devices or simulator devices
503 	 * have been removed because otherwise we get a bug in the
504 	 * debugfs module for referencing NULL objects
505 	 */
506 	hl_debugfs_fini();
507 
508 	class_destroy(hl_class);
509 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
510 
511 	idr_destroy(&hl_devs_idr);
512 
513 	pr_debug("driver removed\n");
514 }
515 
516 module_init(hl_init);
517 module_exit(hl_exit);
518