1 /*
2 * VFIO core
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16 #include <linux/cdev.h>
17 #include <linux/compat.h>
18 #include <linux/device.h>
19 #include <linux/file.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/fs.h>
22 #include <linux/idr.h>
23 #include <linux/iommu.h>
24 #include <linux/list.h>
25 #include <linux/miscdevice.h>
26 #include <linux/module.h>
27 #include <linux/mutex.h>
28 #include <linux/pci.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
37
38 #define DRIVER_VERSION "0.3"
39 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
40 #define DRIVER_DESC "VFIO - User Level meta-driver"
41
42 static struct vfio {
43 struct class *class;
44 struct list_head iommu_drivers_list;
45 struct mutex iommu_drivers_lock;
46 struct list_head group_list;
47 struct idr group_idr;
48 struct mutex group_lock;
49 struct cdev group_cdev;
50 dev_t group_devt;
51 wait_queue_head_t release_q;
52 } vfio;
53
54 struct vfio_iommu_driver {
55 const struct vfio_iommu_driver_ops *ops;
56 struct list_head vfio_next;
57 };
58
59 struct vfio_container {
60 struct kref kref;
61 struct list_head group_list;
62 struct rw_semaphore group_lock;
63 struct vfio_iommu_driver *iommu_driver;
64 void *iommu_data;
65 bool noiommu;
66 };
67
68 struct vfio_unbound_dev {
69 struct device *dev;
70 struct list_head unbound_next;
71 };
72
73 struct vfio_group {
74 struct kref kref;
75 int minor;
76 atomic_t container_users;
77 struct iommu_group *iommu_group;
78 struct vfio_container *container;
79 struct list_head device_list;
80 struct mutex device_lock;
81 struct device *dev;
82 struct notifier_block nb;
83 struct list_head vfio_next;
84 struct list_head container_next;
85 struct list_head unbound_list;
86 struct mutex unbound_lock;
87 atomic_t opened;
88 bool noiommu;
89 };
90
91 struct vfio_device {
92 struct kref kref;
93 struct device *dev;
94 const struct vfio_device_ops *ops;
95 struct vfio_group *group;
96 struct list_head group_next;
97 void *device_data;
98 };
99
100 #ifdef CONFIG_VFIO_NOIOMMU
101 static bool noiommu __read_mostly;
102 module_param_named(enable_unsafe_noiommu_mode,
103 noiommu, bool, S_IRUGO | S_IWUSR);
104 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
105 #endif
106
107 /*
108 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
109 * and remove functions, any use cases other than acquiring the first
110 * reference for the purpose of calling vfio_add_group_dev() or removing
111 * that symmetric reference after vfio_del_group_dev() should use the raw
112 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
113 * removes the device from the dummy group and cannot be nested.
114 */
115 struct iommu_group *vfio_iommu_group_get(struct device *dev)
116 {
117 struct iommu_group *group;
118 int __maybe_unused ret;
119
120 group = iommu_group_get(dev);
121
122 #ifdef CONFIG_VFIO_NOIOMMU
123 /*
124 * With noiommu enabled, an IOMMU group will be created for a device
125 * that doesn't already have one and doesn't have an iommu_ops on its
126 * bus. We set iommudata simply to be able to identify these groups
127 * as special use and for reclamation later.
128 */
129 if (group || !noiommu || iommu_present(dev->bus))
130 return group;
131
132 group = iommu_group_alloc();
133 if (IS_ERR(group))
134 return NULL;
135
136 iommu_group_set_name(group, "vfio-noiommu");
137 iommu_group_set_iommudata(group, &noiommu, NULL);
138 ret = iommu_group_add_device(group, dev);
139 iommu_group_put(group);
140 if (ret)
141 return NULL;
142
143 /*
144 * Where to taint? At this point we've added an IOMMU group for a
145 * device that is not backed by iommu_ops, therefore any iommu_
146 * callback using iommu_ops can legitimately Oops. So, while we may
147 * be about to give a DMA capable device to a user without IOMMU
148 * protection, which is clearly taint-worthy, let's go ahead and do
149 * it here.
150 */
151 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
152 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
153 #endif
154
155 return group;
156 }
157 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
158
159 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
160 {
161 #ifdef CONFIG_VFIO_NOIOMMU
162 if (iommu_group_get_iommudata(group) == &noiommu)
163 iommu_group_remove_device(dev);
164 #endif
165
166 iommu_group_put(group);
167 }
168 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
169
170 #ifdef CONFIG_VFIO_NOIOMMU
171 static void *vfio_noiommu_open(unsigned long arg)
172 {
173 if (arg != VFIO_NOIOMMU_IOMMU)
174 return ERR_PTR(-EINVAL);
175 if (!capable(CAP_SYS_RAWIO))
176 return ERR_PTR(-EPERM);
177
178 return NULL;
179 }
180
181 static void vfio_noiommu_release(void *iommu_data)
182 {
183 }
184
185 static long vfio_noiommu_ioctl(void *iommu_data,
186 unsigned int cmd, unsigned long arg)
187 {
188 if (cmd == VFIO_CHECK_EXTENSION)
189 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
190
191 return -ENOTTY;
192 }
193
194 static int vfio_noiommu_attach_group(void *iommu_data,
195 struct iommu_group *iommu_group)
196 {
197 return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
198 }
199
200 static void vfio_noiommu_detach_group(void *iommu_data,
201 struct iommu_group *iommu_group)
202 {
203 }
204
205 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
206 .name = "vfio-noiommu",
207 .owner = THIS_MODULE,
208 .open = vfio_noiommu_open,
209 .release = vfio_noiommu_release,
210 .ioctl = vfio_noiommu_ioctl,
211 .attach_group = vfio_noiommu_attach_group,
212 .detach_group = vfio_noiommu_detach_group,
213 };
214 #endif
215
216
217 /**
218 * IOMMU driver registration
219 */
220 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
221 {
222 struct vfio_iommu_driver *driver, *tmp;
223
224 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
225 if (!driver)
226 return -ENOMEM;
227
228 driver->ops = ops;
229
230 mutex_lock(&vfio.iommu_drivers_lock);
231
232 /* Check for duplicates */
233 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
234 if (tmp->ops == ops) {
235 mutex_unlock(&vfio.iommu_drivers_lock);
236 kfree(driver);
237 return -EINVAL;
238 }
239 }
240
241 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
242
243 mutex_unlock(&vfio.iommu_drivers_lock);
244
245 return 0;
246 }
247 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
248
249 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
250 {
251 struct vfio_iommu_driver *driver;
252
253 mutex_lock(&vfio.iommu_drivers_lock);
254 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
255 if (driver->ops == ops) {
256 list_del(&driver->vfio_next);
257 mutex_unlock(&vfio.iommu_drivers_lock);
258 kfree(driver);
259 return;
260 }
261 }
262 mutex_unlock(&vfio.iommu_drivers_lock);
263 }
264 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
265
266 /**
267 * Group minor allocation/free - both called with vfio.group_lock held
268 */
269 static int vfio_alloc_group_minor(struct vfio_group *group)
270 {
271 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
272 }
273
274 static void vfio_free_group_minor(int minor)
275 {
276 idr_remove(&vfio.group_idr, minor);
277 }
278
279 static int vfio_iommu_group_notifier(struct notifier_block *nb,
280 unsigned long action, void *data);
281 static void vfio_group_get(struct vfio_group *group);
282
283 /**
284 * Container objects - containers are created when /dev/vfio/vfio is
285 * opened, but their lifecycle extends until the last user is done, so
286 * it's freed via kref. Must support container/group/device being
287 * closed in any order.
288 */
289 static void vfio_container_get(struct vfio_container *container)
290 {
291 kref_get(&container->kref);
292 }
293
294 static void vfio_container_release(struct kref *kref)
295 {
296 struct vfio_container *container;
297 container = container_of(kref, struct vfio_container, kref);
298
299 kfree(container);
300 }
301
302 static void vfio_container_put(struct vfio_container *container)
303 {
304 kref_put(&container->kref, vfio_container_release);
305 }
306
307 static void vfio_group_unlock_and_free(struct vfio_group *group)
308 {
309 mutex_unlock(&vfio.group_lock);
310 /*
311 * Unregister outside of lock. A spurious callback is harmless now
312 * that the group is no longer in vfio.group_list.
313 */
314 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
315 kfree(group);
316 }
317
318 /**
319 * Group objects - create, release, get, put, search
320 */
321 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
322 {
323 struct vfio_group *group, *tmp;
324 struct device *dev;
325 int ret, minor;
326
327 group = kzalloc(sizeof(*group), GFP_KERNEL);
328 if (!group)
329 return ERR_PTR(-ENOMEM);
330
331 kref_init(&group->kref);
332 INIT_LIST_HEAD(&group->device_list);
333 mutex_init(&group->device_lock);
334 INIT_LIST_HEAD(&group->unbound_list);
335 mutex_init(&group->unbound_lock);
336 atomic_set(&group->container_users, 0);
337 atomic_set(&group->opened, 0);
338 group->iommu_group = iommu_group;
339 #ifdef CONFIG_VFIO_NOIOMMU
340 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
341 #endif
342
343 group->nb.notifier_call = vfio_iommu_group_notifier;
344
345 /*
346 * blocking notifiers acquire a rwsem around registering and hold
347 * it around callback. Therefore, we need to register outside of
348 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
349 * do anything unless it can find the group in vfio.group_list, so
350 * no harm in registering early.
351 */
352 ret = iommu_group_register_notifier(iommu_group, &group->nb);
353 if (ret) {
354 kfree(group);
355 return ERR_PTR(ret);
356 }
357
358 mutex_lock(&vfio.group_lock);
359
360 /* Did we race creating this group? */
361 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
362 if (tmp->iommu_group == iommu_group) {
363 vfio_group_get(tmp);
364 vfio_group_unlock_and_free(group);
365 return tmp;
366 }
367 }
368
369 minor = vfio_alloc_group_minor(group);
370 if (minor < 0) {
371 vfio_group_unlock_and_free(group);
372 return ERR_PTR(minor);
373 }
374
375 dev = device_create(vfio.class, NULL,
376 MKDEV(MAJOR(vfio.group_devt), minor),
377 group, "%s%d", group->noiommu ? "noiommu-" : "",
378 iommu_group_id(iommu_group));
379 if (IS_ERR(dev)) {
380 vfio_free_group_minor(minor);
381 vfio_group_unlock_and_free(group);
382 return (struct vfio_group *)dev; /* ERR_PTR */
383 }
384
385 group->minor = minor;
386 group->dev = dev;
387
388 list_add(&group->vfio_next, &vfio.group_list);
389
390 mutex_unlock(&vfio.group_lock);
391
392 return group;
393 }
394
395 /* called with vfio.group_lock held */
396 static void vfio_group_release(struct kref *kref)
397 {
398 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
399 struct vfio_unbound_dev *unbound, *tmp;
400 struct iommu_group *iommu_group = group->iommu_group;
401
402 WARN_ON(!list_empty(&group->device_list));
403
404 list_for_each_entry_safe(unbound, tmp,
405 &group->unbound_list, unbound_next) {
406 list_del(&unbound->unbound_next);
407 kfree(unbound);
408 }
409
410 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
411 list_del(&group->vfio_next);
412 vfio_free_group_minor(group->minor);
413 vfio_group_unlock_and_free(group);
414 iommu_group_put(iommu_group);
415 }
416
417 static void vfio_group_put(struct vfio_group *group)
418 {
419 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
420 }
421
422 struct vfio_group_put_work {
423 struct work_struct work;
424 struct vfio_group *group;
425 };
426
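/*
 * Deferred group put: vfio_group_schedule_put() hands the reference drop
 * off to the system workqueue for callers that cannot safely take
 * vfio.group_lock directly (e.g. the iommu group notifier);
 * vfio_group_put_bg() then performs the actual vfio_group_put().
 */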
427 static void vfio_group_put_bg(struct work_struct *work)
428 {
429 struct vfio_group_put_work *do_work;
430
431 do_work = container_of(work, struct vfio_group_put_work, work);
432
433 vfio_group_put(do_work->group);
434 kfree(do_work);
435 }
436
437 static void vfio_group_schedule_put(struct vfio_group *group)
438 {
439 struct vfio_group_put_work *do_work;
440
441 do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
442 if (WARN_ON(!do_work))
443 return;
444
445 INIT_WORK(&do_work->work, vfio_group_put_bg);
446 do_work->group = group;
447 schedule_work(&do_work->work);
448 }
449
450 /* Assume group_lock or group reference is held */
451 static void vfio_group_get(struct vfio_group *group)
452 {
453 kref_get(&group->kref);
454 }
455
456 /*
457 * Not really a try as we will sleep on the mutex, but we need to make
458 * sure the group pointer is valid under lock and get a reference.
459 */
460 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
461 {
462 struct vfio_group *target = group;
463
464 mutex_lock(&vfio.group_lock);
465 list_for_each_entry(group, &vfio.group_list, vfio_next) {
466 if (group == target) {
467 vfio_group_get(group);
468 mutex_unlock(&vfio.group_lock);
469 return group;
470 }
471 }
472 mutex_unlock(&vfio.group_lock);
473
474 return NULL;
475 }
476
477 static
478 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
479 {
480 struct vfio_group *group;
481
482 mutex_lock(&vfio.group_lock);
483 list_for_each_entry(group, &vfio.group_list, vfio_next) {
484 if (group->iommu_group == iommu_group) {
485 vfio_group_get(group);
486 mutex_unlock(&vfio.group_lock);
487 return group;
488 }
489 }
490 mutex_unlock(&vfio.group_lock);
491
492 return NULL;
493 }
494
495 static struct vfio_group *vfio_group_get_from_minor(int minor)
496 {
497 struct vfio_group *group;
498
499 mutex_lock(&vfio.group_lock);
500 group = idr_find(&vfio.group_idr, minor);
501 if (!group) {
502 mutex_unlock(&vfio.group_lock);
503 return NULL;
504 }
505 vfio_group_get(group);
506 mutex_unlock(&vfio.group_lock);
507
508 return group;
509 }
510
511 /**
512 * Device objects - create, release, get, put, search
513 */
514 static
515 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
516 struct device *dev,
517 const struct vfio_device_ops *ops,
518 void *device_data)
519 {
520 struct vfio_device *device;
521
522 device = kzalloc(sizeof(*device), GFP_KERNEL);
523 if (!device)
524 return ERR_PTR(-ENOMEM);
525
526 kref_init(&device->kref);
527 device->dev = dev;
528 device->group = group;
529 device->ops = ops;
530 device->device_data = device_data;
531 dev_set_drvdata(dev, device);
532
533 /* No need to get group_lock, caller has group reference */
534 vfio_group_get(group);
535
536 mutex_lock(&group->device_lock);
537 list_add(&device->group_next, &group->device_list);
538 mutex_unlock(&group->device_lock);
539
540 return device;
541 }
542
543 static void vfio_device_release(struct kref *kref)
544 {
545 struct vfio_device *device = container_of(kref,
546 struct vfio_device, kref);
547 struct vfio_group *group = device->group;
548
549 list_del(&device->group_next);
550 mutex_unlock(&group->device_lock);
551
552 dev_set_drvdata(device->dev, NULL);
553
554 kfree(device);
555
556 /* vfio_del_group_dev may be waiting for this device */
557 wake_up(&vfio.release_q);
558 }
559
560 /* Device reference always implies a group reference */
561 void vfio_device_put(struct vfio_device *device)
562 {
563 struct vfio_group *group = device->group;
564 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
565 vfio_group_put(group);
566 }
567 EXPORT_SYMBOL_GPL(vfio_device_put);
568
569 static void vfio_device_get(struct vfio_device *device)
570 {
571 vfio_group_get(device->group);
572 kref_get(&device->kref);
573 }
574
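/* Look up the vfio_device backing @dev in @group, taking a reference if found. */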
575 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
576 struct device *dev)
577 {
578 struct vfio_device *device;
579
580 mutex_lock(&group->device_lock);
581 list_for_each_entry(device, &group->device_list, group_next) {
582 if (device->dev == dev) {
583 vfio_device_get(device);
584 mutex_unlock(&group->device_lock);
585 return device;
586 }
587 }
588 mutex_unlock(&group->device_lock);
589 return NULL;
590 }
591
592 /*
593 * Some drivers, like pci-stub, are only used to prevent other drivers from
594 * claiming a device and are therefore perfectly legitimate for a user owned
595 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
596 * of the device, but it does prevent the user from having direct access to
597 * the device, which is useful in some circumstances.
598 *
599 * We also assume that we can include PCI interconnect devices, ie. bridges.
600 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
601 * then all of the downstream devices will be part of the same IOMMU group as
602 * the bridge. Thus, if placing the bridge into the user owned IOVA space
603 * breaks anything, it only does so for user owned devices downstream. Note
604 * that error notification via MSI can be affected for platforms that handle
605 * MSI within the same IOVA space as DMA.
606 */
607 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
608
609 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
610 {
611 int i;
612
613 if (dev_is_pci(dev)) {
614 struct pci_dev *pdev = to_pci_dev(dev);
615
616 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
617 return true;
618 }
619
620 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
621 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
622 return true;
623 }
624
625 return false;
626 }
627
628 /*
629 * A vfio group is viable for use by userspace if all devices are in
630 * one of the following states:
631 * - driver-less
632 * - bound to a vfio driver
633 * - bound to a whitelisted driver
634 * - a PCI interconnect device
635 *
636 * We use two methods to determine whether a device is bound to a vfio
637 * driver. The first is to test whether the device exists in the vfio
638 * group. The second is to test if the device exists on the group
639 * unbound_list, indicating it's in the middle of transitioning from
640 * a vfio driver to driver-less.
641 */
642 static int vfio_dev_viable(struct device *dev, void *data)
643 {
644 struct vfio_group *group = data;
645 struct vfio_device *device;
646 struct device_driver *drv = ACCESS_ONCE(dev->driver);
647 struct vfio_unbound_dev *unbound;
648 int ret = -EINVAL;
649
650 mutex_lock(&group->unbound_lock);
651 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
652 if (dev == unbound->dev) {
653 ret = 0;
654 break;
655 }
656 }
657 mutex_unlock(&group->unbound_lock);
658
659 if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
660 return 0;
661
662 device = vfio_group_get_device(group, dev);
663 if (device) {
664 vfio_device_put(device);
665 return 0;
666 }
667
668 return ret;
669 }
670
671 /**
672 * Async device support
673 */
674 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
675 {
676 struct vfio_device *device;
677
678 /* Do we already know about it? We shouldn't */
679 device = vfio_group_get_device(group, dev);
680 if (WARN_ON_ONCE(device)) {
681 vfio_device_put(device);
682 return 0;
683 }
684
685 /* Nothing to do for idle groups */
686 if (!atomic_read(&group->container_users))
687 return 0;
688
689 /* TODO Prevent device auto probing */
690 WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
691 iommu_group_id(group->iommu_group));
692
693 return 0;
694 }
695
696 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
697 {
698 /* We don't care what happens when the group isn't in use */
699 if (!atomic_read(&group->container_users))
700 return 0;
701
702 return vfio_dev_viable(dev, group);
703 }
704
705 static int vfio_iommu_group_notifier(struct notifier_block *nb,
706 unsigned long action, void *data)
707 {
708 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
709 struct device *dev = data;
710 struct vfio_unbound_dev *unbound;
711
712 /*
713 * Need to go through a group_lock lookup to get a reference or we
714 * risk racing a group being removed. Ignore spurious notifies.
715 */
716 group = vfio_group_try_get(group);
717 if (!group)
718 return NOTIFY_OK;
719
720 switch (action) {
721 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
722 vfio_group_nb_add_dev(group, dev);
723 break;
724 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
725 /*
726 * Nothing to do here. If the device is in use, then the
727 * vfio sub-driver should block the remove callback until
728 * it is unused. If the device is unused or attached to a
729 * stub driver, then it should be released and we don't
730 * care that it will be going away.
731 */
732 break;
733 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
734 pr_debug("%s: Device %s, group %d binding to driver\n",
735 __func__, dev_name(dev),
736 iommu_group_id(group->iommu_group));
737 break;
738 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
739 pr_debug("%s: Device %s, group %d bound to driver %s\n",
740 __func__, dev_name(dev),
741 iommu_group_id(group->iommu_group), dev->driver->name);
742 BUG_ON(vfio_group_nb_verify(group, dev));
743 break;
744 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
745 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
746 __func__, dev_name(dev),
747 iommu_group_id(group->iommu_group), dev->driver->name);
748 break;
749 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
750 pr_debug("%s: Device %s, group %d unbound from driver\n",
751 __func__, dev_name(dev),
752 iommu_group_id(group->iommu_group));
753 /*
754 * XXX An unbound device in a live group is ok, but we'd
755 * really like to avoid the above BUG_ON by preventing other
756 * drivers from binding to it. Once that occurs, we have to
757 * stop the system to maintain isolation. At a minimum, we'd
758 * want a toggle to disable driver auto probe for this device.
759 */
760
761 mutex_lock(&group->unbound_lock);
762 list_for_each_entry(unbound,
763 &group->unbound_list, unbound_next) {
764 if (dev == unbound->dev) {
765 list_del(&unbound->unbound_next);
766 kfree(unbound);
767 break;
768 }
769 }
770 mutex_unlock(&group->unbound_lock);
771 break;
772 }
773
774 /*
775 * If we're the last reference to the group, the group will be
776 * released, which includes unregistering the iommu group notifier.
777 * We hold a read-lock on that notifier list, unregistering needs
778 * a write-lock... deadlock. Release our reference asynchronously
779 * to avoid that situation.
780 */
781 vfio_group_schedule_put(group);
782 return NOTIFY_OK;
783 }
784
785 /**
786 * VFIO driver API
787 */
788 int vfio_add_group_dev(struct device *dev,
789 const struct vfio_device_ops *ops, void *device_data)
790 {
791 struct iommu_group *iommu_group;
792 struct vfio_group *group;
793 struct vfio_device *device;
794
795 iommu_group = iommu_group_get(dev);
796 if (!iommu_group)
797 return -EINVAL;
798
799 group = vfio_group_get_from_iommu(iommu_group);
800 if (!group) {
801 group = vfio_create_group(iommu_group);
802 if (IS_ERR(group)) {
803 iommu_group_put(iommu_group);
804 return PTR_ERR(group);
805 }
806 } else {
807 /*
808 * A found vfio_group already holds a reference to the
809 * iommu_group. A created vfio_group keeps the reference.
810 */
811 iommu_group_put(iommu_group);
812 }
813
814 device = vfio_group_get_device(group, dev);
815 if (device) {
816 WARN(1, "Device %s already exists on group %d\n",
817 dev_name(dev), iommu_group_id(iommu_group));
818 vfio_device_put(device);
819 vfio_group_put(group);
820 return -EBUSY;
821 }
822
823 device = vfio_group_create_device(group, dev, ops, device_data);
824 if (IS_ERR(device)) {
825 vfio_group_put(group);
826 return PTR_ERR(device);
827 }
828
829 /*
830 * Drop all but the vfio_device reference. The vfio_device holds
831 * a reference to the vfio_group, which holds a reference to the
832 * iommu_group.
833 */
834 vfio_group_put(group);
835
836 return 0;
837 }
838 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
839
840 /**
841 * Get a reference to the vfio_device for a device. Even if the
842 * caller thinks they own the device, they could be racing with a
843 * release call path, so we can't trust drvdata for the shortcut.
844 * Go the long way around, from the iommu_group to the vfio_group
845 * to the vfio_device.
846 */
847 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
848 {
849 struct iommu_group *iommu_group;
850 struct vfio_group *group;
851 struct vfio_device *device;
852
853 iommu_group = iommu_group_get(dev);
854 if (!iommu_group)
855 return NULL;
856
857 group = vfio_group_get_from_iommu(iommu_group);
858 iommu_group_put(iommu_group);
859 if (!group)
860 return NULL;
861
862 device = vfio_group_get_device(group, dev);
863 vfio_group_put(group);
864
865 return device;
866 }
867 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
868
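/* Find a device in @group by its dev_name() and take a reference, or return NULL. */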
869 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
870 char *buf)
871 {
872 struct vfio_device *it, *device = NULL;
873
874 mutex_lock(&group->device_lock);
875 list_for_each_entry(it, &group->device_list, group_next) {
876 if (!strcmp(dev_name(it->dev), buf)) {
877 device = it;
878 vfio_device_get(device);
879 break;
880 }
881 }
882 mutex_unlock(&group->device_lock);
883
884 return device;
885 }
886
887 /*
888 * Caller must hold a reference to the vfio_device
889 */
890 void *vfio_device_data(struct vfio_device *device)
891 {
892 return device->device_data;
893 }
894 EXPORT_SYMBOL_GPL(vfio_device_data);
895
896 /* Given a referenced group, check if it contains the device */
897 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
898 {
899 struct vfio_device *device;
900
901 device = vfio_group_get_device(group, dev);
902 if (!device)
903 return false;
904
905 vfio_device_put(device);
906 return true;
907 }
908
909 /*
910 * Decrement the device reference count and wait for the device to be
911 * removed. Open file descriptors for the device... */
912 void *vfio_del_group_dev(struct device *dev)
913 {
914 struct vfio_device *device = dev_get_drvdata(dev);
915 struct vfio_group *group = device->group;
916 void *device_data = device->device_data;
917 struct vfio_unbound_dev *unbound;
918 unsigned int i = 0;
919 long ret;
920 bool interrupted = false;
921
922 /*
923 * The group exists so long as we have a device reference. Get
924 * a group reference and use it to scan for the device going away.
925 */
926 vfio_group_get(group);
927
928 /*
929 * When the device is removed from the group, the group suddenly
930 * becomes non-viable; the device has a driver (until the unbind
931 * completes), but it's not present in the group. This is bad news
932 * for any external users that need to re-acquire a group reference
933 * in order to match and release their existing reference. To
934 * solve this, we track such devices on the unbound_list to bridge
935 * the gap until they're fully unbound.
936 */
937 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
938 if (unbound) {
939 unbound->dev = dev;
940 mutex_lock(&group->unbound_lock);
941 list_add(&unbound->unbound_next, &group->unbound_list);
942 mutex_unlock(&group->unbound_lock);
943 }
944 WARN_ON(!unbound);
945
946 vfio_device_put(device);
947
948 /*
949 * If the device is still present in the group after the above
950 * 'put', then it is in use and we need to request it from the
951 * bus driver. The driver may in turn need to request the
952 * device from the user. We send the request on an arbitrary
953 * interval with counter to allow the driver to take escalating
954 * measures to release the device if it has the ability to do so.
955 */
956 do {
957 device = vfio_group_get_device(group, dev);
958 if (!device)
959 break;
960
961 if (device->ops->request)
962 device->ops->request(device_data, i++);
963
964 vfio_device_put(device);
965
966 if (interrupted) {
967 ret = wait_event_timeout(vfio.release_q,
968 !vfio_dev_present(group, dev), HZ * 10);
969 } else {
970 ret = wait_event_interruptible_timeout(vfio.release_q,
971 !vfio_dev_present(group, dev), HZ * 10);
972 if (ret == -ERESTARTSYS) {
973 interrupted = true;
974 dev_warn(dev,
975 "Device is currently in use, task"
976 " \"%s\" (%d) "
977 "blocked until device is released",
978 current->comm, task_pid_nr(current));
979 }
980 }
981 } while (ret <= 0);
982
983 vfio_group_put(group);
984
985 return device_data;
986 }
987 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
988
989 /**
990 * VFIO base fd, /dev/vfio/vfio
991 */
992 static long vfio_ioctl_check_extension(struct vfio_container *container,
993 unsigned long arg)
994 {
995 struct vfio_iommu_driver *driver;
996 long ret = 0;
997
998 down_read(&container->group_lock);
999
1000 driver = container->iommu_driver;
1001
1002 switch (arg) {
1003 /* No base extensions yet */
1004 default:
1005 /*
1006 * If no driver is set, poll all registered drivers for
1007 * extensions and return the first positive result. If
1008 * a driver is already set, further queries will be passed
1009 * only to that driver.
1010 */
1011 if (!driver) {
1012 mutex_lock(&vfio.iommu_drivers_lock);
1013 list_for_each_entry(driver, &vfio.iommu_drivers_list,
1014 vfio_next) {
1015
1016 #ifdef CONFIG_VFIO_NOIOMMU
1017 if (!list_empty(&container->group_list) &&
1018 (container->noiommu !=
1019 (driver->ops == &vfio_noiommu_ops)))
1020 continue;
1021 #endif
1022
1023 if (!try_module_get(driver->ops->owner))
1024 continue;
1025
1026 ret = driver->ops->ioctl(NULL,
1027 VFIO_CHECK_EXTENSION,
1028 arg);
1029 module_put(driver->ops->owner);
1030 if (ret > 0)
1031 break;
1032 }
1033 mutex_unlock(&vfio.iommu_drivers_lock);
1034 } else
1035 ret = driver->ops->ioctl(container->iommu_data,
1036 VFIO_CHECK_EXTENSION, arg);
1037 }
1038
1039 up_read(&container->group_lock);
1040
1041 return ret;
1042 }
1043
1044 /* hold write lock on container->group_lock */
1045 static int __vfio_container_attach_groups(struct vfio_container *container,
1046 struct vfio_iommu_driver *driver,
1047 void *data)
1048 {
1049 struct vfio_group *group;
1050 int ret = -ENODEV;
1051
1052 list_for_each_entry(group, &container->group_list, container_next) {
1053 ret = driver->ops->attach_group(data, group->iommu_group);
1054 if (ret)
1055 goto unwind;
1056 }
1057
1058 return ret;
1059
1060 unwind:
1061 list_for_each_entry_continue_reverse(group, &container->group_list,
1062 container_next) {
1063 driver->ops->detach_group(data, group->iommu_group);
1064 }
1065
1066 return ret;
1067 }
1068
1069 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1070 unsigned long arg)
1071 {
1072 struct vfio_iommu_driver *driver;
1073 long ret = -ENODEV;
1074
1075 down_write(&container->group_lock);
1076
1077 /*
1078 * The container is designed to be an unprivileged interface while
1079 * the group can be assigned to specific users. Therefore, only by
1080 * adding a group to a container does the user get the privilege of
1081 * enabling the iommu, which may allocate finite resources. There
1082 * is no unset_iommu, but by removing all the groups from a container,
1083 * the container is deprivileged and returns to an unset state.
1084 */
1085 if (list_empty(&container->group_list) || container->iommu_driver) {
1086 up_write(&container->group_lock);
1087 return -EINVAL;
1088 }
1089
1090 mutex_lock(&vfio.iommu_drivers_lock);
1091 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1092 void *data;
1093
1094 #ifdef CONFIG_VFIO_NOIOMMU
1095 /*
1096 * Only noiommu containers can use vfio-noiommu and noiommu
1097 * containers can only use vfio-noiommu.
1098 */
1099 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1100 continue;
1101 #endif
1102
1103 if (!try_module_get(driver->ops->owner))
1104 continue;
1105
1106 /*
1107 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1108 * so test which iommu driver reported support for this
1109 * extension and call open on them. We also pass them the
1110 * magic, allowing a single driver to support multiple
1111 * interfaces if they'd like.
1112 */
1113 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1114 module_put(driver->ops->owner);
1115 continue;
1116 }
1117
1118 data = driver->ops->open(arg);
1119 if (IS_ERR(data)) {
1120 ret = PTR_ERR(data);
1121 module_put(driver->ops->owner);
1122 continue;
1123 }
1124
1125 ret = __vfio_container_attach_groups(container, driver, data);
1126 if (ret) {
1127 driver->ops->release(data);
1128 module_put(driver->ops->owner);
1129 continue;
1130 }
1131
1132 container->iommu_driver = driver;
1133 container->iommu_data = data;
1134 break;
1135 }
1136
1137 mutex_unlock(&vfio.iommu_drivers_lock);
1138 up_write(&container->group_lock);
1139
1140 return ret;
1141 }
1142
1143 static long vfio_fops_unl_ioctl(struct file *filep,
1144 unsigned int cmd, unsigned long arg)
1145 {
1146 struct vfio_container *container = filep->private_data;
1147 struct vfio_iommu_driver *driver;
1148 void *data;
1149 long ret = -EINVAL;
1150
1151 if (!container)
1152 return ret;
1153
1154 switch (cmd) {
1155 case VFIO_GET_API_VERSION:
1156 ret = VFIO_API_VERSION;
1157 break;
1158 case VFIO_CHECK_EXTENSION:
1159 ret = vfio_ioctl_check_extension(container, arg);
1160 break;
1161 case VFIO_SET_IOMMU:
1162 ret = vfio_ioctl_set_iommu(container, arg);
1163 break;
1164 default:
1165 down_read(&container->group_lock);
1166
1167 driver = container->iommu_driver;
1168 data = container->iommu_data;
1169
1170 if (driver) /* passthrough all unrecognized ioctls */
1171 ret = driver->ops->ioctl(data, cmd, arg);
1172
1173 up_read(&container->group_lock);
1174 }
1175
1176 return ret;
1177 }
1178
1179 #ifdef CONFIG_COMPAT
1180 static long vfio_fops_compat_ioctl(struct file *filep,
1181 unsigned int cmd, unsigned long arg)
1182 {
1183 arg = (unsigned long)compat_ptr(arg);
1184 return vfio_fops_unl_ioctl(filep, cmd, arg);
1185 }
1186 #endif /* CONFIG_COMPAT */
1187
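/* Each open of /dev/vfio/vfio allocates a fresh, empty container. */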
1188 static int vfio_fops_open(struct inode *inode, struct file *filep)
1189 {
1190 struct vfio_container *container;
1191
1192 container = kzalloc(sizeof(*container), GFP_KERNEL);
1193 if (!container)
1194 return -ENOMEM;
1195
1196 INIT_LIST_HEAD(&container->group_list);
1197 init_rwsem(&container->group_lock);
1198 kref_init(&container->kref);
1199
1200 filep->private_data = container;
1201
1202 return 0;
1203 }
1204
1205 static int vfio_fops_release(struct inode *inode, struct file *filep)
1206 {
1207 struct vfio_container *container = filep->private_data;
1208
1209 filep->private_data = NULL;
1210
1211 vfio_container_put(container);
1212
1213 return 0;
1214 }
1215
1216 /*
1217 * Once an iommu driver is set, we optionally pass read/write/mmap
1218 * on to the driver, allowing management interfaces beyond ioctl.
1219 */
1220 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1221 size_t count, loff_t *ppos)
1222 {
1223 struct vfio_container *container = filep->private_data;
1224 struct vfio_iommu_driver *driver;
1225 ssize_t ret = -EINVAL;
1226
1227 down_read(&container->group_lock);
1228
1229 driver = container->iommu_driver;
1230 if (likely(driver && driver->ops->read))
1231 ret = driver->ops->read(container->iommu_data,
1232 buf, count, ppos);
1233
1234 up_read(&container->group_lock);
1235
1236 return ret;
1237 }
1238
1239 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1240 size_t count, loff_t *ppos)
1241 {
1242 struct vfio_container *container = filep->private_data;
1243 struct vfio_iommu_driver *driver;
1244 ssize_t ret = -EINVAL;
1245
1246 down_read(&container->group_lock);
1247
1248 driver = container->iommu_driver;
1249 if (likely(driver && driver->ops->write))
1250 ret = driver->ops->write(container->iommu_data,
1251 buf, count, ppos);
1252
1253 up_read(&container->group_lock);
1254
1255 return ret;
1256 }
1257
1258 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1259 {
1260 struct vfio_container *container = filep->private_data;
1261 struct vfio_iommu_driver *driver;
1262 int ret = -EINVAL;
1263
1264 down_read(&container->group_lock);
1265
1266 driver = container->iommu_driver;
1267 if (likely(driver && driver->ops->mmap))
1268 ret = driver->ops->mmap(container->iommu_data, vma);
1269
1270 up_read(&container->group_lock);
1271
1272 return ret;
1273 }
1274
1275 static const struct file_operations vfio_fops = {
1276 .owner = THIS_MODULE,
1277 .open = vfio_fops_open,
1278 .release = vfio_fops_release,
1279 .read = vfio_fops_read,
1280 .write = vfio_fops_write,
1281 .unlocked_ioctl = vfio_fops_unl_ioctl,
1282 #ifdef CONFIG_COMPAT
1283 .compat_ioctl = vfio_fops_compat_ioctl,
1284 #endif
1285 .mmap = vfio_fops_mmap,
1286 };
1287
1288 /**
1289 * VFIO Group fd, /dev/vfio/$GROUP
1290 */
1291 static void __vfio_group_unset_container(struct vfio_group *group)
1292 {
1293 struct vfio_container *container = group->container;
1294 struct vfio_iommu_driver *driver;
1295
1296 down_write(&container->group_lock);
1297
1298 driver = container->iommu_driver;
1299 if (driver)
1300 driver->ops->detach_group(container->iommu_data,
1301 group->iommu_group);
1302
1303 group->container = NULL;
1304 list_del(&group->container_next);
1305
1306 /* Detaching the last group deprivileges a container, remove iommu */
1307 if (driver && list_empty(&container->group_list)) {
1308 driver->ops->release(container->iommu_data);
1309 module_put(driver->ops->owner);
1310 container->iommu_driver = NULL;
1311 container->iommu_data = NULL;
1312 }
1313
1314 up_write(&container->group_lock);
1315
1316 vfio_container_put(container);
1317 }
1318
1319 /*
1320 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1321 * if there was no container to unset. Since the ioctl is called on
1322 * the group, we know that it still exists, therefore the only valid
1323 * transition here is 1->0.
1324 */
1325 static int vfio_group_unset_container(struct vfio_group *group)
1326 {
1327 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1328
1329 if (!users)
1330 return -EINVAL;
1331 if (users != 1)
1332 return -EBUSY;
1333
1334 __vfio_group_unset_container(group);
1335
1336 return 0;
1337 }
1338
1339 /*
1340 * When removing container users, anything that removes the last user
1341 * implicitly removes the group from the container. That is, if the
1342 * group file descriptor is closed, as well as any device file descriptors,
1343 * the group is free.
1344 */
1345 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1346 {
1347 if (0 == atomic_dec_if_positive(&group->container_users))
1348 __vfio_group_unset_container(group);
1349 }
1350
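/*
 * VFIO_GROUP_SET_CONTAINER: validate the container fd, attach the group to
 * any iommu driver already set on the container, then take a container
 * reference and account the group's first container user.
 */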
1351 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1352 {
1353 struct fd f;
1354 struct vfio_container *container;
1355 struct vfio_iommu_driver *driver;
1356 int ret = 0;
1357
1358 if (atomic_read(&group->container_users))
1359 return -EINVAL;
1360
1361 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1362 return -EPERM;
1363
1364 f = fdget(container_fd);
1365 if (!f.file)
1366 return -EBADF;
1367
1368 /* Sanity check, is this really our fd? */
1369 if (f.file->f_op != &vfio_fops) {
1370 fdput(f);
1371 return -EINVAL;
1372 }
1373
1374 container = f.file->private_data;
1375 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1376
1377 down_write(&container->group_lock);
1378
1379 /* Real groups and fake groups cannot mix */
1380 if (!list_empty(&container->group_list) &&
1381 container->noiommu != group->noiommu) {
1382 ret = -EPERM;
1383 goto unlock_out;
1384 }
1385
1386 driver = container->iommu_driver;
1387 if (driver) {
1388 ret = driver->ops->attach_group(container->iommu_data,
1389 group->iommu_group);
1390 if (ret)
1391 goto unlock_out;
1392 }
1393
1394 group->container = container;
1395 container->noiommu = group->noiommu;
1396 list_add(&group->container_next, &container->group_list);
1397
1398 /* Get a reference on the container and mark a user within the group */
1399 vfio_container_get(container);
1400 atomic_inc(&group->container_users);
1401
1402 unlock_out:
1403 up_write(&container->group_lock);
1404 fdput(f);
1405 return ret;
1406 }
1407
1408 static bool vfio_group_viable(struct vfio_group *group)
1409 {
1410 return (iommu_group_for_each_dev(group->iommu_group,
1411 group, vfio_dev_viable) == 0);
1412 }
1413
1414 static const struct file_operations vfio_device_fops;
1415
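/*
 * VFIO_GROUP_GET_DEVICE_FD: open the named device through its vfio bus
 * driver and return a new file descriptor backed by vfio_device_fops.
 */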
1416 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1417 {
1418 struct vfio_device *device;
1419 struct file *filep;
1420 int ret;
1421
1422 if (0 == atomic_read(&group->container_users) ||
1423 !group->container->iommu_driver || !vfio_group_viable(group))
1424 return -EINVAL;
1425
1426 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1427 return -EPERM;
1428
1429 device = vfio_device_get_from_name(group, buf);
1430 if (!device)
1431 return -ENODEV;
1432
1433 ret = device->ops->open(device->device_data);
1434 if (ret) {
1435 vfio_device_put(device);
1436 return ret;
1437 }
1438
1439 /*
1440 * We can't use anon_inode_getfd() because we need to modify
1441 * the f_mode flags directly to allow more than just ioctls
1442 */
1443 ret = get_unused_fd_flags(O_CLOEXEC);
1444 if (ret < 0) {
1445 device->ops->release(device->device_data);
1446 vfio_device_put(device);
1447 return ret;
1448 }
1449
1450 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1451 device, O_RDWR);
1452 if (IS_ERR(filep)) {
1453 put_unused_fd(ret);
1454 ret = PTR_ERR(filep);
1455 device->ops->release(device->device_data);
1456 vfio_device_put(device);
1457 return ret;
1458 }
1459
1460 /*
1461 * TODO: add an anon_inode interface to do this.
1462 * Appears to be missing by lack of need rather than
1463 * explicitly prevented. Now there's need.
1464 */
1465 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1466
1467 atomic_inc(&group->container_users);
1468
1469 fd_install(ret, filep);
1470
1471 if (group->noiommu)
1472 dev_warn(device->dev, "vfio-noiommu device opened by user "
1473 "(%s:%d)\n", current->comm, task_pid_nr(current));
1474
1475 return ret;
1476 }
1477
1478 static long vfio_group_fops_unl_ioctl(struct file *filep,
1479 unsigned int cmd, unsigned long arg)
1480 {
1481 struct vfio_group *group = filep->private_data;
1482 long ret = -ENOTTY;
1483
1484 switch (cmd) {
1485 case VFIO_GROUP_GET_STATUS:
1486 {
1487 struct vfio_group_status status;
1488 unsigned long minsz;
1489
1490 minsz = offsetofend(struct vfio_group_status, flags);
1491
1492 if (copy_from_user(&status, (void __user *)arg, minsz))
1493 return -EFAULT;
1494
1495 if (status.argsz < minsz)
1496 return -EINVAL;
1497
1498 status.flags = 0;
1499
1500 if (vfio_group_viable(group))
1501 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1502
1503 if (group->container)
1504 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1505
1506 if (copy_to_user((void __user *)arg, &status, minsz))
1507 return -EFAULT;
1508
1509 ret = 0;
1510 break;
1511 }
1512 case VFIO_GROUP_SET_CONTAINER:
1513 {
1514 int fd;
1515
1516 if (get_user(fd, (int __user *)arg))
1517 return -EFAULT;
1518
1519 if (fd < 0)
1520 return -EINVAL;
1521
1522 ret = vfio_group_set_container(group, fd);
1523 break;
1524 }
1525 case VFIO_GROUP_UNSET_CONTAINER:
1526 ret = vfio_group_unset_container(group);
1527 break;
1528 case VFIO_GROUP_GET_DEVICE_FD:
1529 {
1530 char *buf;
1531
1532 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1533 if (IS_ERR(buf))
1534 return PTR_ERR(buf);
1535
1536 ret = vfio_group_get_device_fd(group, buf);
1537 kfree(buf);
1538 break;
1539 }
1540 }
1541
1542 return ret;
1543 }
1544
1545 #ifdef CONFIG_COMPAT
1546 static long vfio_group_fops_compat_ioctl(struct file *filep,
1547 unsigned int cmd, unsigned long arg)
1548 {
1549 arg = (unsigned long)compat_ptr(arg);
1550 return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1551 }
1552 #endif /* CONFIG_COMPAT */
1553
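/*
 * Open of /dev/vfio/$GROUP: only a single open per group is allowed, and
 * noiommu groups additionally require CAP_SYS_RAWIO.
 */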
1554 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1555 {
1556 struct vfio_group *group;
1557 int opened;
1558
1559 group = vfio_group_get_from_minor(iminor(inode));
1560 if (!group)
1561 return -ENODEV;
1562
1563 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1564 vfio_group_put(group);
1565 return -EPERM;
1566 }
1567
1568 /* Do we need multiple instances of the group open? Seems not. */
1569 opened = atomic_cmpxchg(&group->opened, 0, 1);
1570 if (opened) {
1571 vfio_group_put(group);
1572 return -EBUSY;
1573 }
1574
1575 /* Is something still in use from a previous open? */
1576 if (group->container) {
1577 atomic_dec(&group->opened);
1578 vfio_group_put(group);
1579 return -EBUSY;
1580 }
1581
1582 filep->private_data = group;
1583
1584 return 0;
1585 }
1586
1587 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1588 {
1589 struct vfio_group *group = filep->private_data;
1590
1591 filep->private_data = NULL;
1592
1593 vfio_group_try_dissolve_container(group);
1594
1595 atomic_dec(&group->opened);
1596
1597 vfio_group_put(group);
1598
1599 return 0;
1600 }
1601
1602 static const struct file_operations vfio_group_fops = {
1603 .owner = THIS_MODULE,
1604 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1605 #ifdef CONFIG_COMPAT
1606 .compat_ioctl = vfio_group_fops_compat_ioctl,
1607 #endif
1608 .open = vfio_group_fops_open,
1609 .release = vfio_group_fops_release,
1610 };
1611
1612 /**
1613 * VFIO Device fd
1614 */
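/*
 * Closing a device fd releases the device in the bus driver, drops the
 * group's container user, and finally drops the device reference.
 */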
1615 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1616 {
1617 struct vfio_device *device = filep->private_data;
1618
1619 device->ops->release(device->device_data);
1620
1621 vfio_group_try_dissolve_container(device->group);
1622
1623 vfio_device_put(device);
1624
1625 return 0;
1626 }
1627
1628 static long vfio_device_fops_unl_ioctl(struct file *filep,
1629 unsigned int cmd, unsigned long arg)
1630 {
1631 struct vfio_device *device = filep->private_data;
1632
1633 if (unlikely(!device->ops->ioctl))
1634 return -EINVAL;
1635
1636 return device->ops->ioctl(device->device_data, cmd, arg);
1637 }
1638
1639 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1640 size_t count, loff_t *ppos)
1641 {
1642 struct vfio_device *device = filep->private_data;
1643
1644 if (unlikely(!device->ops->read))
1645 return -EINVAL;
1646
1647 return device->ops->read(device->device_data, buf, count, ppos);
1648 }
1649
1650 static ssize_t vfio_device_fops_write(struct file *filep,
1651 const char __user *buf,
1652 size_t count, loff_t *ppos)
1653 {
1654 struct vfio_device *device = filep->private_data;
1655
1656 if (unlikely(!device->ops->write))
1657 return -EINVAL;
1658
1659 return device->ops->write(device->device_data, buf, count, ppos);
1660 }
1661
1662 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1663 {
1664 struct vfio_device *device = filep->private_data;
1665
1666 if (unlikely(!device->ops->mmap))
1667 return -EINVAL;
1668
1669 return device->ops->mmap(device->device_data, vma);
1670 }
1671
1672 #ifdef CONFIG_COMPAT
1673 static long vfio_device_fops_compat_ioctl(struct file *filep,
1674 unsigned int cmd, unsigned long arg)
1675 {
1676 arg = (unsigned long)compat_ptr(arg);
1677 return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1678 }
1679 #endif /* CONFIG_COMPAT */
1680
1681 static const struct file_operations vfio_device_fops = {
1682 .owner = THIS_MODULE,
1683 .release = vfio_device_fops_release,
1684 .read = vfio_device_fops_read,
1685 .write = vfio_device_fops_write,
1686 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1687 #ifdef CONFIG_COMPAT
1688 .compat_ioctl = vfio_device_fops_compat_ioctl,
1689 #endif
1690 .mmap = vfio_device_fops_mmap,
1691 };
1692
1693 /**
1694 * External user API, exported by symbols to be linked dynamically.
1695 *
1696 * The protocol includes:
1697 * 1. do normal VFIO init operation:
1698 * - opening a new container;
1699 * - attaching group(s) to it;
1700 * - setting an IOMMU driver for a container.
1701 * When IOMMU is set for a container, all groups in it are
1702 * considered ready to use by an external user.
1703 *
1704 * 2. User space passes a group fd to an external user.
1705 * The external user calls vfio_group_get_external_user()
1706 * to verify that:
1707 * - the group is initialized;
1708 * - IOMMU is set for it.
1709 * If both checks passed, vfio_group_get_external_user()
1710 * increments the container user counter to prevent
1711 * the VFIO group from disposal before KVM exits.
1712 *
1713 * 3. The external user calls vfio_external_user_iommu_id()
1714 * to know an IOMMU ID.
1715 *
1716 * 4. When the external KVM finishes, it calls
1717 * vfio_group_put_external_user() to release the VFIO group.
1718 * This call decrements the container user counter.
1719 */
1720 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1721 {
1722 struct vfio_group *group = filep->private_data;
1723
1724 if (filep->f_op != &vfio_group_fops)
1725 return ERR_PTR(-EINVAL);
1726
1727 if (!atomic_inc_not_zero(&group->container_users))
1728 return ERR_PTR(-EINVAL);
1729
1730 if (group->noiommu) {
1731 atomic_dec(&group->container_users);
1732 return ERR_PTR(-EPERM);
1733 }
1734
1735 if (!group->container->iommu_driver ||
1736 !vfio_group_viable(group)) {
1737 atomic_dec(&group->container_users);
1738 return ERR_PTR(-EINVAL);
1739 }
1740
1741 vfio_group_get(group);
1742
1743 return group;
1744 }
1745 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1746
1747 void vfio_group_put_external_user(struct vfio_group *group)
1748 {
1749 vfio_group_try_dissolve_container(group);
1750 vfio_group_put(group);
1751 }
1752 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1753
1754 bool vfio_external_group_match_file(struct vfio_group *test_group,
1755 struct file *filep)
1756 {
1757 struct vfio_group *group = filep->private_data;
1758
1759 return (filep->f_op == &vfio_group_fops) && (group == test_group);
1760 }
1761 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1762
1763 int vfio_external_user_iommu_id(struct vfio_group *group)
1764 {
1765 return iommu_group_id(group->iommu_group);
1766 }
1767 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1768
1769 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1770 {
1771 return vfio_ioctl_check_extension(group->container, arg);
1772 }
1773 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1774
1775 /**
1776 * Sub-module support
1777 */
1778 /*
1779 * Helper for managing a buffer of info chain capabilities, allocate or
1780 * reallocate a buffer with additional @size, filling in @id and @version
1781 * of the capability. A pointer to the new capability is returned.
1782 *
1783 * NB. The chain is based at the head of the buffer, so new entries are
1784 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1785 * next offsets prior to copying to the user buffer.
1786 */
1787 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1788 size_t size, u16 id, u16 version)
1789 {
1790 void *buf;
1791 struct vfio_info_cap_header *header, *tmp;
1792
1793 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1794 if (!buf) {
1795 kfree(caps->buf);
1796 caps->size = 0;
1797 return ERR_PTR(-ENOMEM);
1798 }
1799
1800 caps->buf = buf;
1801 header = buf + caps->size;
1802
1803 /* Eventually copied to user buffer, zero */
1804 memset(header, 0, size);
1805
1806 header->id = id;
1807 header->version = version;
1808
1809 /* Add to the end of the capability chain */
1810 for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next)
1811 ; /* nothing */
1812
1813 tmp->next = caps->size;
1814 caps->size += size;
1815
1816 return header;
1817 }
1818 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1819
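/* Adjust each capability's next offset by @offset before copying the chain to userspace. */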
1820 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1821 {
1822 struct vfio_info_cap_header *tmp;
1823
1824 for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset)
1825 tmp->next += offset;
1826 }
1827 EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
1828
1829 /**
1830 * Module/class support
1831 */
1832 static char *vfio_devnode(struct device *dev, umode_t *mode)
1833 {
1834 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
1835 }
1836
1837 static struct miscdevice vfio_dev = {
1838 .minor = VFIO_MINOR,
1839 .name = "vfio",
1840 .fops = &vfio_fops,
1841 .nodename = "vfio/vfio",
1842 .mode = S_IRUGO | S_IWUGO,
1843 };
1844
1845 static int __init vfio_init(void)
1846 {
1847 int ret;
1848
1849 idr_init(&vfio.group_idr);
1850 mutex_init(&vfio.group_lock);
1851 mutex_init(&vfio.iommu_drivers_lock);
1852 INIT_LIST_HEAD(&vfio.group_list);
1853 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
1854 init_waitqueue_head(&vfio.release_q);
1855
1856 ret = misc_register(&vfio_dev);
1857 if (ret) {
1858 pr_err("vfio: misc device register failed\n");
1859 return ret;
1860 }
1861
1862 /* /dev/vfio/$GROUP */
1863 vfio.class = class_create(THIS_MODULE, "vfio");
1864 if (IS_ERR(vfio.class)) {
1865 ret = PTR_ERR(vfio.class);
1866 goto err_class;
1867 }
1868
1869 vfio.class->devnode = vfio_devnode;
1870
1871 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
1872 if (ret)
1873 goto err_alloc_chrdev;
1874
1875 cdev_init(&vfio.group_cdev, &vfio_group_fops);
1876 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
1877 if (ret)
1878 goto err_cdev_add;
1879
1880 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1881
1882 /*
1883 * Attempt to load known iommu-drivers. This gives us a working
1884 * environment without the user needing to explicitly load iommu
1885 * drivers.
1886 */
1887 request_module_nowait("vfio_iommu_type1");
1888 request_module_nowait("vfio_iommu_spapr_tce");
1889
1890 #ifdef CONFIG_VFIO_NOIOMMU
1891 vfio_register_iommu_driver(&vfio_noiommu_ops);
1892 #endif
1893 return 0;
1894
1895 err_cdev_add:
1896 unregister_chrdev_region(vfio.group_devt, MINORMASK);
1897 err_alloc_chrdev:
1898 class_destroy(vfio.class);
1899 vfio.class = NULL;
1900 err_class:
1901 misc_deregister(&vfio_dev);
1902 return ret;
1903 }
1904
1905 static void __exit vfio_cleanup(void)
1906 {
1907 WARN_ON(!list_empty(&vfio.group_list));
1908
1909 #ifdef CONFIG_VFIO_NOIOMMU
1910 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
1911 #endif
1912 idr_destroy(&vfio.group_idr);
1913 cdev_del(&vfio.group_cdev);
1914 unregister_chrdev_region(vfio.group_devt, MINORMASK);
1915 class_destroy(vfio.class);
1916 vfio.class = NULL;
1917 misc_deregister(&vfio_dev);
1918 }
1919
1920 module_init(vfio_init);
1921 module_exit(vfio_cleanup);
1922
1923 MODULE_VERSION(DRIVER_VERSION);
1924 MODULE_LICENSE("GPL v2");
1925 MODULE_AUTHOR(DRIVER_AUTHOR);
1926 MODULE_DESCRIPTION(DRIVER_DESC);
1927 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
1928 MODULE_ALIAS("devname:vfio/vfio");
1929