1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * Copyright 2016-2019 HabanaLabs, Ltd.
5 * All Rights Reserved.
6 *
7 */
8
9 #define pr_fmt(fmt) "habanalabs: " fmt
10
11 #include "habanalabs.h"
12
13 #include <linux/pci.h>
14 #include <linux/module.h>
15
16 #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
17
18 #define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators"
19
20 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
21 MODULE_DESCRIPTION(HL_DRIVER_DESC);
22 MODULE_LICENSE("GPL v2");
23
24 static int hl_major;
25 static struct class *hl_class;
26 static DEFINE_IDR(hl_devs_idr);
27 static DEFINE_MUTEX(hl_devs_idr_lock);
28
29 static int timeout_locked = 5;
30 static int reset_on_lockup = 1;
31
32 module_param(timeout_locked, int, 0444);
33 MODULE_PARM_DESC(timeout_locked,
34 "Device lockup timeout in seconds (0 = disabled, default 5s)");
35
36 module_param(reset_on_lockup, int, 0444);
37 MODULE_PARM_DESC(reset_on_lockup,
38 "Do device reset on lockup (0 = no, 1 = yes, default yes)");
39
40 #define PCI_VENDOR_ID_HABANALABS 0x1da3
41
42 #define PCI_IDS_GOYA 0x0001
43
44 static const struct pci_device_id ids[] = {
45 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
46 { 0, }
47 };
48 MODULE_DEVICE_TABLE(pci, ids);
49
50 /*
51 * get_asic_type - translate device id to asic type
52 *
53 * @device: id of the PCI device
54 *
55 * Translate device id to asic type.
56 * In case of unidentified device, return -1
57 */
get_asic_type(u16 device)58 static enum hl_asic_type get_asic_type(u16 device)
59 {
60 enum hl_asic_type asic_type;
61
62 switch (device) {
63 case PCI_IDS_GOYA:
64 asic_type = ASIC_GOYA;
65 break;
66 default:
67 asic_type = ASIC_INVALID;
68 break;
69 }
70
71 return asic_type;
72 }
73
74 /*
75 * hl_device_open - open function for habanalabs device
76 *
77 * @inode: pointer to inode structure
78 * @filp: pointer to file structure
79 *
80 * Called when process opens an habanalabs device.
81 */
hl_device_open(struct inode * inode,struct file * filp)82 int hl_device_open(struct inode *inode, struct file *filp)
83 {
84 struct hl_device *hdev;
85 struct hl_fpriv *hpriv;
86 int rc;
87
88 mutex_lock(&hl_devs_idr_lock);
89 hdev = idr_find(&hl_devs_idr, iminor(inode));
90 mutex_unlock(&hl_devs_idr_lock);
91
92 if (!hdev) {
93 pr_err("Couldn't find device %d:%d\n",
94 imajor(inode), iminor(inode));
95 return -ENXIO;
96 }
97
98 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
99 if (!hpriv)
100 return -ENOMEM;
101
102 hpriv->hdev = hdev;
103 filp->private_data = hpriv;
104 hpriv->filp = filp;
105 mutex_init(&hpriv->restore_phase_mutex);
106 kref_init(&hpriv->refcount);
107 nonseekable_open(inode, filp);
108
109 hl_cb_mgr_init(&hpriv->cb_mgr);
110 hl_ctx_mgr_init(&hpriv->ctx_mgr);
111
112 hpriv->taskpid = find_get_pid(current->pid);
113
114 mutex_lock(&hdev->fpriv_list_lock);
115
116 if (hl_device_disabled_or_in_reset(hdev)) {
117 dev_err_ratelimited(hdev->dev,
118 "Can't open %s because it is disabled or in reset\n",
119 dev_name(hdev->dev));
120 rc = -EPERM;
121 goto out_err;
122 }
123
124 if (hdev->in_debug) {
125 dev_err_ratelimited(hdev->dev,
126 "Can't open %s because it is being debugged by another user\n",
127 dev_name(hdev->dev));
128 rc = -EPERM;
129 goto out_err;
130 }
131
132 if (hdev->compute_ctx) {
133 dev_dbg_ratelimited(hdev->dev,
134 "Can't open %s because another user is working on it\n",
135 dev_name(hdev->dev));
136 rc = -EBUSY;
137 goto out_err;
138 }
139
140 rc = hl_ctx_create(hdev, hpriv);
141 if (rc) {
142 dev_err(hdev->dev, "Failed to create context %d\n", rc);
143 goto out_err;
144 }
145
146 /* Device is IDLE at this point so it is legal to change PLLs.
147 * There is no need to check anything because if the PLL is
148 * already HIGH, the set function will return without doing
149 * anything
150 */
151 hl_device_set_frequency(hdev, PLL_HIGH);
152
153 list_add(&hpriv->dev_node, &hdev->fpriv_list);
154 mutex_unlock(&hdev->fpriv_list_lock);
155
156 hl_debugfs_add_file(hpriv);
157
158 return 0;
159
160 out_err:
161 mutex_unlock(&hdev->fpriv_list_lock);
162
163 hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
164 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
165 filp->private_data = NULL;
166 mutex_destroy(&hpriv->restore_phase_mutex);
167 put_pid(hpriv->taskpid);
168
169 kfree(hpriv);
170 return rc;
171 }
172
hl_device_open_ctrl(struct inode * inode,struct file * filp)173 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
174 {
175 struct hl_device *hdev;
176 struct hl_fpriv *hpriv;
177 int rc;
178
179 mutex_lock(&hl_devs_idr_lock);
180 hdev = idr_find(&hl_devs_idr, iminor(inode));
181 mutex_unlock(&hl_devs_idr_lock);
182
183 if (!hdev) {
184 pr_err("Couldn't find device %d:%d\n",
185 imajor(inode), iminor(inode));
186 return -ENXIO;
187 }
188
189 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
190 if (!hpriv)
191 return -ENOMEM;
192
193 mutex_lock(&hdev->fpriv_list_lock);
194
195 if (hl_device_disabled_or_in_reset(hdev)) {
196 dev_err_ratelimited(hdev->dev_ctrl,
197 "Can't open %s because it is disabled or in reset\n",
198 dev_name(hdev->dev_ctrl));
199 rc = -EPERM;
200 goto out_err;
201 }
202
203 list_add(&hpriv->dev_node, &hdev->fpriv_list);
204 mutex_unlock(&hdev->fpriv_list_lock);
205
206 hpriv->hdev = hdev;
207 filp->private_data = hpriv;
208 hpriv->filp = filp;
209 hpriv->is_control = true;
210 nonseekable_open(inode, filp);
211
212 hpriv->taskpid = find_get_pid(current->pid);
213
214 return 0;
215
216 out_err:
217 mutex_unlock(&hdev->fpriv_list_lock);
218 kfree(hpriv);
219 return rc;
220 }
221
set_driver_behavior_per_device(struct hl_device * hdev)222 static void set_driver_behavior_per_device(struct hl_device *hdev)
223 {
224 hdev->mmu_enable = 1;
225 hdev->cpu_enable = 1;
226 hdev->fw_loading = 1;
227 hdev->cpu_queues_enable = 1;
228 hdev->heartbeat = 1;
229
230 hdev->reset_pcilink = 0;
231 }
232
233 /*
234 * create_hdev - create habanalabs device instance
235 *
236 * @dev: will hold the pointer to the new habanalabs device structure
237 * @pdev: pointer to the pci device
238 * @asic_type: in case of simulator device, which device is it
239 * @minor: in case of simulator device, the minor of the device
240 *
241 * Allocate memory for habanalabs device and initialize basic fields
242 * Identify the ASIC type
243 * Allocate ID (minor) for the device (only for real devices)
244 */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev,enum hl_asic_type asic_type,int minor)245 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
246 enum hl_asic_type asic_type, int minor)
247 {
248 struct hl_device *hdev;
249 int rc, main_id, ctrl_id = 0;
250
251 *dev = NULL;
252
253 hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
254 if (!hdev)
255 return -ENOMEM;
256
257 /* First, we must find out which ASIC are we handling. This is needed
258 * to configure the behavior of the driver (kernel parameters)
259 */
260 if (pdev) {
261 hdev->asic_type = get_asic_type(pdev->device);
262 if (hdev->asic_type == ASIC_INVALID) {
263 dev_err(&pdev->dev, "Unsupported ASIC\n");
264 rc = -ENODEV;
265 goto free_hdev;
266 }
267 } else {
268 hdev->asic_type = asic_type;
269 }
270
271 hdev->major = hl_major;
272 hdev->reset_on_lockup = reset_on_lockup;
273 hdev->pldm = 0;
274
275 set_driver_behavior_per_device(hdev);
276
277 if (timeout_locked)
278 hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
279 else
280 hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
281
282 hdev->disabled = true;
283 hdev->pdev = pdev; /* can be NULL in case of simulator device */
284
285 /* Set default DMA mask to 32 bits */
286 hdev->dma_mask = 32;
287
288 mutex_lock(&hl_devs_idr_lock);
289
290 /* Always save 2 numbers, 1 for main device and 1 for control.
291 * They must be consecutive
292 */
293 main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
294 GFP_KERNEL);
295
296 if (main_id >= 0)
297 ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
298 main_id + 2, GFP_KERNEL);
299
300 mutex_unlock(&hl_devs_idr_lock);
301
302 if ((main_id < 0) || (ctrl_id < 0)) {
303 if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
304 pr_err("too many devices in the system\n");
305
306 if (main_id >= 0) {
307 mutex_lock(&hl_devs_idr_lock);
308 idr_remove(&hl_devs_idr, main_id);
309 mutex_unlock(&hl_devs_idr_lock);
310 }
311
312 rc = -EBUSY;
313 goto free_hdev;
314 }
315
316 hdev->id = main_id;
317 hdev->id_control = ctrl_id;
318
319 *dev = hdev;
320
321 return 0;
322
323 free_hdev:
324 kfree(hdev);
325 return rc;
326 }
327
328 /*
329 * destroy_hdev - destroy habanalabs device instance
330 *
331 * @dev: pointer to the habanalabs device structure
332 *
333 */
destroy_hdev(struct hl_device * hdev)334 void destroy_hdev(struct hl_device *hdev)
335 {
336 /* Remove device from the device list */
337 mutex_lock(&hl_devs_idr_lock);
338 idr_remove(&hl_devs_idr, hdev->id);
339 idr_remove(&hl_devs_idr, hdev->id_control);
340 mutex_unlock(&hl_devs_idr_lock);
341
342 kfree(hdev);
343 }
344
/*
 * hl_pmops_suspend - power management suspend callback
 *
 * @dev: pointer to the generic device structure
 *
 * Returns the result of hl_device_suspend(), or 0 if no habanalabs device
 * is attached to @dev as driver data.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
358
/*
 * hl_pmops_resume - power management resume callback
 *
 * @dev: pointer to the generic device structure
 *
 * Returns the result of hl_device_resume(), or 0 if no habanalabs device
 * is attached to @dev as driver data.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
372
373 /*
374 * hl_pci_probe - probe PCI habanalabs devices
375 *
376 * @pdev: pointer to pci device
377 * @id: pointer to pci device id structure
378 *
379 * Standard PCI probe function for habanalabs device.
380 * Create a new habanalabs device and initialize it according to the
381 * device's type
382 */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)383 static int hl_pci_probe(struct pci_dev *pdev,
384 const struct pci_device_id *id)
385 {
386 struct hl_device *hdev;
387 int rc;
388
389 dev_info(&pdev->dev, HL_NAME
390 " device found [%04x:%04x] (rev %x)\n",
391 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
392
393 rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
394 if (rc)
395 return rc;
396
397 pci_set_drvdata(pdev, hdev);
398
399 rc = hl_device_init(hdev, hl_class);
400 if (rc) {
401 dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
402 rc = -ENODEV;
403 goto disable_device;
404 }
405
406 return 0;
407
408 disable_device:
409 pci_set_drvdata(pdev, NULL);
410 destroy_hdev(hdev);
411
412 return rc;
413 }
414
415 /*
416 * hl_pci_remove - remove PCI habanalabs devices
417 *
418 * @pdev: pointer to pci device
419 *
420 * Standard PCI remove function for habanalabs device
421 */
hl_pci_remove(struct pci_dev * pdev)422 static void hl_pci_remove(struct pci_dev *pdev)
423 {
424 struct hl_device *hdev;
425
426 hdev = pci_get_drvdata(pdev);
427 if (!hdev)
428 return;
429
430 hl_device_fini(hdev);
431 pci_set_drvdata(pdev, NULL);
432
433 destroy_hdev(hdev);
434 }
435
436 static const struct dev_pm_ops hl_pm_ops = {
437 .suspend = hl_pmops_suspend,
438 .resume = hl_pmops_resume,
439 };
440
441 static struct pci_driver hl_pci_driver = {
442 .name = HL_NAME,
443 .id_table = ids,
444 .probe = hl_pci_probe,
445 .remove = hl_pci_remove,
446 .driver.pm = &hl_pm_ops,
447 };
448
449 /*
450 * hl_init - Initialize the habanalabs kernel driver
451 */
hl_init(void)452 static int __init hl_init(void)
453 {
454 int rc;
455 dev_t dev;
456
457 pr_info("loading driver\n");
458
459 rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
460 if (rc < 0) {
461 pr_err("unable to get major\n");
462 return rc;
463 }
464
465 hl_major = MAJOR(dev);
466
467 hl_class = class_create(THIS_MODULE, HL_NAME);
468 if (IS_ERR(hl_class)) {
469 pr_err("failed to allocate class\n");
470 rc = PTR_ERR(hl_class);
471 goto remove_major;
472 }
473
474 hl_debugfs_init();
475
476 rc = pci_register_driver(&hl_pci_driver);
477 if (rc) {
478 pr_err("failed to register pci device\n");
479 goto remove_debugfs;
480 }
481
482 pr_debug("driver loaded\n");
483
484 return 0;
485
486 remove_debugfs:
487 hl_debugfs_fini();
488 class_destroy(hl_class);
489 remove_major:
490 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
491 return rc;
492 }
493
494 /*
495 * hl_exit - Release all resources of the habanalabs kernel driver
496 */
hl_exit(void)497 static void __exit hl_exit(void)
498 {
499 pci_unregister_driver(&hl_pci_driver);
500
501 /*
502 * Removing debugfs must be after all devices or simulator devices
503 * have been removed because otherwise we get a bug in the
504 * debugfs module for referencing NULL objects
505 */
506 hl_debugfs_fini();
507
508 class_destroy(hl_class);
509 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
510
511 idr_destroy(&hl_devs_idr);
512
513 pr_debug("driver removed\n");
514 }
515
516 module_init(hl_init);
517 module_exit(hl_exit);
518