1 /*
2  * VFIO core
3  *
4  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
5  *     Author: Alex Williamson <alex.williamson@redhat.com>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  *
11  * Derived from original vfio:
12  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
13  * Author: Tom Lyon, pugs@cisco.com
14  */
15 
16 #include <linux/cdev.h>
17 #include <linux/compat.h>
18 #include <linux/device.h>
19 #include <linux/file.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/fs.h>
22 #include <linux/idr.h>
23 #include <linux/iommu.h>
24 #include <linux/list.h>
25 #include <linux/miscdevice.h>
26 #include <linux/module.h>
27 #include <linux/mutex.h>
28 #include <linux/pci.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
37 
38 #define DRIVER_VERSION	"0.3"
39 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
40 #define DRIVER_DESC	"VFIO - User Level meta-driver"
41 
42 static struct vfio {
43 	struct class			*class;
44 	struct list_head		iommu_drivers_list;
45 	struct mutex			iommu_drivers_lock;
46 	struct list_head		group_list;
47 	struct idr			group_idr;
48 	struct mutex			group_lock;
49 	struct cdev			group_cdev;
50 	dev_t				group_devt;
51 	wait_queue_head_t		release_q;
52 } vfio;
53 
54 struct vfio_iommu_driver {
55 	const struct vfio_iommu_driver_ops	*ops;
56 	struct list_head			vfio_next;
57 };
58 
59 struct vfio_container {
60 	struct kref			kref;
61 	struct list_head		group_list;
62 	struct rw_semaphore		group_lock;
63 	struct vfio_iommu_driver	*iommu_driver;
64 	void				*iommu_data;
65 	bool				noiommu;
66 };
67 
68 struct vfio_unbound_dev {
69 	struct device			*dev;
70 	struct list_head		unbound_next;
71 };
72 
73 struct vfio_group {
74 	struct kref			kref;
75 	int				minor;
76 	atomic_t			container_users;
77 	struct iommu_group		*iommu_group;
78 	struct vfio_container		*container;
79 	struct list_head		device_list;
80 	struct mutex			device_lock;
81 	struct device			*dev;
82 	struct notifier_block		nb;
83 	struct list_head		vfio_next;
84 	struct list_head		container_next;
85 	struct list_head		unbound_list;
86 	struct mutex			unbound_lock;
87 	atomic_t			opened;
88 	bool				noiommu;
89 };
90 
91 struct vfio_device {
92 	struct kref			kref;
93 	struct device			*dev;
94 	const struct vfio_device_ops	*ops;
95 	struct vfio_group		*group;
96 	struct list_head		group_next;
97 	void				*device_data;
98 };
99 
100 #ifdef CONFIG_VFIO_NOIOMMU
101 static bool noiommu __read_mostly;
102 module_param_named(enable_unsafe_noiommu_mode,
103 		   noiommu, bool, S_IRUGO | S_IWUSR);
104 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
105 #endif
106 
107 /*
108  * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
109  * and remove functions; any use case other than acquiring the first
110  * reference for the purpose of calling vfio_add_group_dev() or removing
111  * that symmetric reference after vfio_del_group_dev() should use the raw
112  * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
113  * removes the device from the dummy group and cannot be nested.
114  */
115 struct iommu_group *vfio_iommu_group_get(struct device *dev)
116 {
117 	struct iommu_group *group;
118 	int __maybe_unused ret;
119 
120 	group = iommu_group_get(dev);
121 
122 #ifdef CONFIG_VFIO_NOIOMMU
123 	/*
124 	 * With noiommu enabled, an IOMMU group will be created for a device
125 	 * that doesn't already have one and doesn't have iommu_ops on its
126 	 * bus.  We set iommudata simply to be able to identify these groups
127 	 * as special use and for reclamation later.
128 	 */
129 	if (group || !noiommu || iommu_present(dev->bus))
130 		return group;
131 
132 	group = iommu_group_alloc();
133 	if (IS_ERR(group))
134 		return NULL;
135 
136 	iommu_group_set_name(group, "vfio-noiommu");
137 	iommu_group_set_iommudata(group, &noiommu, NULL);
138 	ret = iommu_group_add_device(group, dev);
139 	iommu_group_put(group);
140 	if (ret)
141 		return NULL;
142 
143 	/*
144 	 * Where to taint?  At this point we've added an IOMMU group for a
145 	 * device that is not backed by iommu_ops, therefore any iommu_
146 	 * callback using iommu_ops can legitimately Oops.  So, while we may
147 	 * be about to give a DMA capable device to a user without IOMMU
148 	 * protection, which is clearly taint-worthy, let's go ahead and do
149 	 * it here.
150 	 */
151 	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
152 	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
153 #endif
154 
155 	return group;
156 }
157 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
158 
159 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
160 {
161 #ifdef CONFIG_VFIO_NOIOMMU
162 	if (iommu_group_get_iommudata(group) == &noiommu)
163 		iommu_group_remove_device(dev);
164 #endif
165 
166 	iommu_group_put(group);
167 }
168 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
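/*
 * Illustrative sketch (not part of this file): roughly how a VFIO bus
 * driver's probe and remove paths pair these helpers with
 * vfio_add_group_dev()/vfio_del_group_dev().  The foo_* names and
 * foo_priv (the driver's per-device state) are hypothetical.
 *
 *	static int foo_probe(struct device *dev)
 *	{
 *		struct iommu_group *group;
 *		int ret;
 *
 *		group = vfio_iommu_group_get(dev);
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(dev, &foo_vfio_ops, foo_priv);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *		return ret;
 *	}
 *
 *	static void foo_remove(struct device *dev)
 *	{
 *		void *priv = vfio_del_group_dev(dev);
 *
 *		vfio_iommu_group_put(dev->iommu_group, dev);
 *		kfree(priv);
 *	}
 */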
169 
170 #ifdef CONFIG_VFIO_NOIOMMU
171 static void *vfio_noiommu_open(unsigned long arg)
172 {
173 	if (arg != VFIO_NOIOMMU_IOMMU)
174 		return ERR_PTR(-EINVAL);
175 	if (!capable(CAP_SYS_RAWIO))
176 		return ERR_PTR(-EPERM);
177 
178 	return NULL;
179 }
180 
181 static void vfio_noiommu_release(void *iommu_data)
182 {
183 }
184 
185 static long vfio_noiommu_ioctl(void *iommu_data,
186 			       unsigned int cmd, unsigned long arg)
187 {
188 	if (cmd == VFIO_CHECK_EXTENSION)
189 		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
190 
191 	return -ENOTTY;
192 }
193 
194 static int vfio_noiommu_attach_group(void *iommu_data,
195 				     struct iommu_group *iommu_group)
196 {
197 	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
198 }
199 
200 static void vfio_noiommu_detach_group(void *iommu_data,
201 				      struct iommu_group *iommu_group)
202 {
203 }
204 
205 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
206 	.name = "vfio-noiommu",
207 	.owner = THIS_MODULE,
208 	.open = vfio_noiommu_open,
209 	.release = vfio_noiommu_release,
210 	.ioctl = vfio_noiommu_ioctl,
211 	.attach_group = vfio_noiommu_attach_group,
212 	.detach_group = vfio_noiommu_detach_group,
213 };
214 #endif
215 
216 
217 /**
218  * IOMMU driver registration
219  */
220 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
221 {
222 	struct vfio_iommu_driver *driver, *tmp;
223 
224 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
225 	if (!driver)
226 		return -ENOMEM;
227 
228 	driver->ops = ops;
229 
230 	mutex_lock(&vfio.iommu_drivers_lock);
231 
232 	/* Check for duplicates */
233 	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
234 		if (tmp->ops == ops) {
235 			mutex_unlock(&vfio.iommu_drivers_lock);
236 			kfree(driver);
237 			return -EINVAL;
238 		}
239 	}
240 
241 	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
242 
243 	mutex_unlock(&vfio.iommu_drivers_lock);
244 
245 	return 0;
246 }
247 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
248 
249 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
250 {
251 	struct vfio_iommu_driver *driver;
252 
253 	mutex_lock(&vfio.iommu_drivers_lock);
254 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
255 		if (driver->ops == ops) {
256 			list_del(&driver->vfio_next);
257 			mutex_unlock(&vfio.iommu_drivers_lock);
258 			kfree(driver);
259 			return;
260 		}
261 	}
262 	mutex_unlock(&vfio.iommu_drivers_lock);
263 }
264 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
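/*
 * Illustrative sketch (not part of this file): an IOMMU backend such as
 * vfio_iommu_type1 registers its ops at module load and unregisters them
 * at unload.  The foo_* names are hypothetical placeholders.
 *
 *	static const struct vfio_iommu_driver_ops foo_iommu_ops = {
 *		.name		= "vfio-foo",
 *		.owner		= THIS_MODULE,
 *		.open		= foo_iommu_open,
 *		.release	= foo_iommu_release,
 *		.ioctl		= foo_iommu_ioctl,
 *		.attach_group	= foo_iommu_attach_group,
 *		.detach_group	= foo_iommu_detach_group,
 *	};
 *
 *	static int __init foo_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&foo_iommu_ops);
 *	}
 *
 *	static void __exit foo_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&foo_iommu_ops);
 *	}
 */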
265 
266 /**
267  * Group minor allocation/free - both called with vfio.group_lock held
268  */
269 static int vfio_alloc_group_minor(struct vfio_group *group)
270 {
271 	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
272 }
273 
274 static void vfio_free_group_minor(int minor)
275 {
276 	idr_remove(&vfio.group_idr, minor);
277 }
278 
279 static int vfio_iommu_group_notifier(struct notifier_block *nb,
280 				     unsigned long action, void *data);
281 static void vfio_group_get(struct vfio_group *group);
282 
283 /**
284  * Container objects - containers are created when /dev/vfio/vfio is
285  * opened, but their lifecycle extends until the last user is done, so
286  * it's freed via kref.  Must support container/group/device being
287  * closed in any order.
288  */
289 static void vfio_container_get(struct vfio_container *container)
290 {
291 	kref_get(&container->kref);
292 }
293 
294 static void vfio_container_release(struct kref *kref)
295 {
296 	struct vfio_container *container;
297 	container = container_of(kref, struct vfio_container, kref);
298 
299 	kfree(container);
300 }
301 
302 static void vfio_container_put(struct vfio_container *container)
303 {
304 	kref_put(&container->kref, vfio_container_release);
305 }
306 
307 static void vfio_group_unlock_and_free(struct vfio_group *group)
308 {
309 	mutex_unlock(&vfio.group_lock);
310 	/*
311 	 * Unregister outside of lock.  A spurious callback is harmless now
312 	 * that the group is no longer in vfio.group_list.
313 	 */
314 	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
315 	kfree(group);
316 }
317 
318 /**
319  * Group objects - create, release, get, put, search
320  */
321 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
322 {
323 	struct vfio_group *group, *tmp;
324 	struct device *dev;
325 	int ret, minor;
326 
327 	group = kzalloc(sizeof(*group), GFP_KERNEL);
328 	if (!group)
329 		return ERR_PTR(-ENOMEM);
330 
331 	kref_init(&group->kref);
332 	INIT_LIST_HEAD(&group->device_list);
333 	mutex_init(&group->device_lock);
334 	INIT_LIST_HEAD(&group->unbound_list);
335 	mutex_init(&group->unbound_lock);
336 	atomic_set(&group->container_users, 0);
337 	atomic_set(&group->opened, 0);
338 	group->iommu_group = iommu_group;
339 #ifdef CONFIG_VFIO_NOIOMMU
340 	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
341 #endif
342 
343 	group->nb.notifier_call = vfio_iommu_group_notifier;
344 
345 	/*
346 	 * blocking notifiers acquire a rwsem around registering and hold
347 	 * it around callback.  Therefore, need to register outside of
348 	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
349 	 * do anything unless it can find the group in vfio.group_list, so
350 	 * no harm in registering early.
351 	 */
352 	ret = iommu_group_register_notifier(iommu_group, &group->nb);
353 	if (ret) {
354 		kfree(group);
355 		return ERR_PTR(ret);
356 	}
357 
358 	mutex_lock(&vfio.group_lock);
359 
360 	/* Did we race creating this group? */
361 	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
362 		if (tmp->iommu_group == iommu_group) {
363 			vfio_group_get(tmp);
364 			vfio_group_unlock_and_free(group);
365 			return tmp;
366 		}
367 	}
368 
369 	minor = vfio_alloc_group_minor(group);
370 	if (minor < 0) {
371 		vfio_group_unlock_and_free(group);
372 		return ERR_PTR(minor);
373 	}
374 
375 	dev = device_create(vfio.class, NULL,
376 			    MKDEV(MAJOR(vfio.group_devt), minor),
377 			    group, "%s%d", group->noiommu ? "noiommu-" : "",
378 			    iommu_group_id(iommu_group));
379 	if (IS_ERR(dev)) {
380 		vfio_free_group_minor(minor);
381 		vfio_group_unlock_and_free(group);
382 		return (struct vfio_group *)dev; /* ERR_PTR */
383 	}
384 
385 	group->minor = minor;
386 	group->dev = dev;
387 
388 	list_add(&group->vfio_next, &vfio.group_list);
389 
390 	mutex_unlock(&vfio.group_lock);
391 
392 	return group;
393 }
394 
395 /* called with vfio.group_lock held */
396 static void vfio_group_release(struct kref *kref)
397 {
398 	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
399 	struct vfio_unbound_dev *unbound, *tmp;
400 	struct iommu_group *iommu_group = group->iommu_group;
401 
402 	WARN_ON(!list_empty(&group->device_list));
403 
404 	list_for_each_entry_safe(unbound, tmp,
405 				 &group->unbound_list, unbound_next) {
406 		list_del(&unbound->unbound_next);
407 		kfree(unbound);
408 	}
409 
410 	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
411 	list_del(&group->vfio_next);
412 	vfio_free_group_minor(group->minor);
413 	vfio_group_unlock_and_free(group);
414 	iommu_group_put(iommu_group);
415 }
416 
417 static void vfio_group_put(struct vfio_group *group)
418 {
419 	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
420 }
421 
422 struct vfio_group_put_work {
423 	struct work_struct work;
424 	struct vfio_group *group;
425 };
426 
427 static void vfio_group_put_bg(struct work_struct *work)
428 {
429 	struct vfio_group_put_work *do_work;
430 
431 	do_work = container_of(work, struct vfio_group_put_work, work);
432 
433 	vfio_group_put(do_work->group);
434 	kfree(do_work);
435 }
436 
437 static void vfio_group_schedule_put(struct vfio_group *group)
438 {
439 	struct vfio_group_put_work *do_work;
440 
441 	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
442 	if (WARN_ON(!do_work))
443 		return;
444 
445 	INIT_WORK(&do_work->work, vfio_group_put_bg);
446 	do_work->group = group;
447 	schedule_work(&do_work->work);
448 }
449 
450 /* Assume group_lock or group reference is held */
451 static void vfio_group_get(struct vfio_group *group)
452 {
453 	kref_get(&group->kref);
454 }
455 
456 /*
457  * Not really a try as we will sleep for mutex, but we need to make
458  * sure the group pointer is valid under lock and get a reference.
459  */
460 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
461 {
462 	struct vfio_group *target = group;
463 
464 	mutex_lock(&vfio.group_lock);
465 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
466 		if (group == target) {
467 			vfio_group_get(group);
468 			mutex_unlock(&vfio.group_lock);
469 			return group;
470 		}
471 	}
472 	mutex_unlock(&vfio.group_lock);
473 
474 	return NULL;
475 }
476 
477 static
478 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
479 {
480 	struct vfio_group *group;
481 
482 	mutex_lock(&vfio.group_lock);
483 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
484 		if (group->iommu_group == iommu_group) {
485 			vfio_group_get(group);
486 			mutex_unlock(&vfio.group_lock);
487 			return group;
488 		}
489 	}
490 	mutex_unlock(&vfio.group_lock);
491 
492 	return NULL;
493 }
494 
495 static struct vfio_group *vfio_group_get_from_minor(int minor)
496 {
497 	struct vfio_group *group;
498 
499 	mutex_lock(&vfio.group_lock);
500 	group = idr_find(&vfio.group_idr, minor);
501 	if (!group) {
502 		mutex_unlock(&vfio.group_lock);
503 		return NULL;
504 	}
505 	vfio_group_get(group);
506 	mutex_unlock(&vfio.group_lock);
507 
508 	return group;
509 }
510 
511 /**
512  * Device objects - create, release, get, put, search
513  */
514 static
515 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
516 					     struct device *dev,
517 					     const struct vfio_device_ops *ops,
518 					     void *device_data)
519 {
520 	struct vfio_device *device;
521 
522 	device = kzalloc(sizeof(*device), GFP_KERNEL);
523 	if (!device)
524 		return ERR_PTR(-ENOMEM);
525 
526 	kref_init(&device->kref);
527 	device->dev = dev;
528 	device->group = group;
529 	device->ops = ops;
530 	device->device_data = device_data;
531 	dev_set_drvdata(dev, device);
532 
533 	/* No need to get group_lock, caller has group reference */
534 	vfio_group_get(group);
535 
536 	mutex_lock(&group->device_lock);
537 	list_add(&device->group_next, &group->device_list);
538 	mutex_unlock(&group->device_lock);
539 
540 	return device;
541 }
542 
543 static void vfio_device_release(struct kref *kref)
544 {
545 	struct vfio_device *device = container_of(kref,
546 						  struct vfio_device, kref);
547 	struct vfio_group *group = device->group;
548 
549 	list_del(&device->group_next);
550 	mutex_unlock(&group->device_lock);
551 
552 	dev_set_drvdata(device->dev, NULL);
553 
554 	kfree(device);
555 
556 	/* vfio_del_group_dev may be waiting for this device */
557 	wake_up(&vfio.release_q);
558 }
559 
560 /* Device reference always implies a group reference */
561 void vfio_device_put(struct vfio_device *device)
562 {
563 	struct vfio_group *group = device->group;
564 	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
565 	vfio_group_put(group);
566 }
567 EXPORT_SYMBOL_GPL(vfio_device_put);
568 
569 static void vfio_device_get(struct vfio_device *device)
570 {
571 	vfio_group_get(device->group);
572 	kref_get(&device->kref);
573 }
574 
575 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
576 						 struct device *dev)
577 {
578 	struct vfio_device *device;
579 
580 	mutex_lock(&group->device_lock);
581 	list_for_each_entry(device, &group->device_list, group_next) {
582 		if (device->dev == dev) {
583 			vfio_device_get(device);
584 			mutex_unlock(&group->device_lock);
585 			return device;
586 		}
587 	}
588 	mutex_unlock(&group->device_lock);
589 	return NULL;
590 }
591 
592 /*
593  * Some drivers, like pci-stub, are only used to prevent other drivers from
594  * claiming a device and are therefore perfectly legitimate for a user owned
595  * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
596  * of the device, but it does prevent the user from having direct access to
597  * the device, which is useful in some circumstances.
598  *
599  * We also assume that we can include PCI interconnect devices, i.e. bridges.
600  * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
601  * then all of the downstream devices will be part of the same IOMMU group as
602  * the bridge.  Thus, if placing the bridge into the user owned IOVA space
603  * breaks anything, it only does so for user owned devices downstream.  Note
604  * that error notification via MSI can be affected for platforms that handle
605  * MSI within the same IOVA space as DMA.
606  */
607 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
608 
609 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
610 {
611 	int i;
612 
613 	if (dev_is_pci(dev)) {
614 		struct pci_dev *pdev = to_pci_dev(dev);
615 
616 		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
617 			return true;
618 	}
619 
620 	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
621 		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
622 			return true;
623 	}
624 
625 	return false;
626 }
627 
628 /*
629  * A vfio group is viable for use by userspace if all devices are in
630  * one of the following states:
631  *  - driver-less
632  *  - bound to a vfio driver
633  *  - bound to a whitelisted driver
634  *  - a PCI interconnect device
635  *
636  * We use two methods to determine whether a device is bound to a vfio
637  * driver.  The first is to test whether the device exists in the vfio
638  * group.  The second is to test if the device exists on the group
639  * unbound_list, indicating it's in the middle of transitioning from
640  * a vfio driver to driver-less.
641  */
642 static int vfio_dev_viable(struct device *dev, void *data)
643 {
644 	struct vfio_group *group = data;
645 	struct vfio_device *device;
646 	struct device_driver *drv = ACCESS_ONCE(dev->driver);
647 	struct vfio_unbound_dev *unbound;
648 	int ret = -EINVAL;
649 
650 	mutex_lock(&group->unbound_lock);
651 	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
652 		if (dev == unbound->dev) {
653 			ret = 0;
654 			break;
655 		}
656 	}
657 	mutex_unlock(&group->unbound_lock);
658 
659 	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
660 		return 0;
661 
662 	device = vfio_group_get_device(group, dev);
663 	if (device) {
664 		vfio_device_put(device);
665 		return 0;
666 	}
667 
668 	return ret;
669 }
670 
671 /**
672  * Async device support
673  */
674 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
675 {
676 	struct vfio_device *device;
677 
678 	/* Do we already know about it?  We shouldn't */
679 	device = vfio_group_get_device(group, dev);
680 	if (WARN_ON_ONCE(device)) {
681 		vfio_device_put(device);
682 		return 0;
683 	}
684 
685 	/* Nothing to do for idle groups */
686 	if (!atomic_read(&group->container_users))
687 		return 0;
688 
689 	/* TODO Prevent device auto probing */
690 	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
691 	     iommu_group_id(group->iommu_group));
692 
693 	return 0;
694 }
695 
696 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
697 {
698 	/* We don't care what happens when the group isn't in use */
699 	if (!atomic_read(&group->container_users))
700 		return 0;
701 
702 	return vfio_dev_viable(dev, group);
703 }
704 
705 static int vfio_iommu_group_notifier(struct notifier_block *nb,
706 				     unsigned long action, void *data)
707 {
708 	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
709 	struct device *dev = data;
710 	struct vfio_unbound_dev *unbound;
711 
712 	/*
713 	 * Need to go through a group_lock lookup to get a reference or we
714 	 * risk racing a group being removed.  Ignore spurious notifies.
715 	 */
716 	group = vfio_group_try_get(group);
717 	if (!group)
718 		return NOTIFY_OK;
719 
720 	switch (action) {
721 	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
722 		vfio_group_nb_add_dev(group, dev);
723 		break;
724 	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
725 		/*
726 		 * Nothing to do here.  If the device is in use, then the
727 		 * vfio sub-driver should block the remove callback until
728 		 * it is unused.  If the device is unused or attached to a
729 		 * stub driver, then it should be released and we don't
730 		 * care that it will be going away.
731 		 */
732 		break;
733 	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
734 		pr_debug("%s: Device %s, group %d binding to driver\n",
735 			 __func__, dev_name(dev),
736 			 iommu_group_id(group->iommu_group));
737 		break;
738 	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
739 		pr_debug("%s: Device %s, group %d bound to driver %s\n",
740 			 __func__, dev_name(dev),
741 			 iommu_group_id(group->iommu_group), dev->driver->name);
742 		BUG_ON(vfio_group_nb_verify(group, dev));
743 		break;
744 	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
745 		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
746 			 __func__, dev_name(dev),
747 			 iommu_group_id(group->iommu_group), dev->driver->name);
748 		break;
749 	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
750 		pr_debug("%s: Device %s, group %d unbound from driver\n",
751 			 __func__, dev_name(dev),
752 			 iommu_group_id(group->iommu_group));
753 		/*
754 		 * XXX An unbound device in a live group is ok, but we'd
755 		 * really like to avoid the above BUG_ON by preventing other
756 		 * drivers from binding to it.  Once that occurs, we have to
757 		 * stop the system to maintain isolation.  At a minimum, we'd
758 		 * want a toggle to disable driver auto probe for this device.
759 		 */
760 
761 		mutex_lock(&group->unbound_lock);
762 		list_for_each_entry(unbound,
763 				    &group->unbound_list, unbound_next) {
764 			if (dev == unbound->dev) {
765 				list_del(&unbound->unbound_next);
766 				kfree(unbound);
767 				break;
768 			}
769 		}
770 		mutex_unlock(&group->unbound_lock);
771 		break;
772 	}
773 
774 	/*
775 	 * If we're the last reference to the group, the group will be
776 	 * released, which includes unregistering the iommu group notifier.
777 	 * We hold a read-lock on that notifier list, unregistering needs
778 	 * a write-lock... deadlock.  Release our reference asynchronously
779 	 * to avoid that situation.
780 	 */
781 	vfio_group_schedule_put(group);
782 	return NOTIFY_OK;
783 }
784 
785 /**
786  * VFIO driver API
787  */
788 int vfio_add_group_dev(struct device *dev,
789 		       const struct vfio_device_ops *ops, void *device_data)
790 {
791 	struct iommu_group *iommu_group;
792 	struct vfio_group *group;
793 	struct vfio_device *device;
794 
795 	iommu_group = iommu_group_get(dev);
796 	if (!iommu_group)
797 		return -EINVAL;
798 
799 	group = vfio_group_get_from_iommu(iommu_group);
800 	if (!group) {
801 		group = vfio_create_group(iommu_group);
802 		if (IS_ERR(group)) {
803 			iommu_group_put(iommu_group);
804 			return PTR_ERR(group);
805 		}
806 	} else {
807 		/*
808 		 * A found vfio_group already holds a reference to the
809 		 * iommu_group.  A created vfio_group keeps the reference.
810 		 */
811 		iommu_group_put(iommu_group);
812 	}
813 
814 	device = vfio_group_get_device(group, dev);
815 	if (device) {
816 		WARN(1, "Device %s already exists on group %d\n",
817 		     dev_name(dev), iommu_group_id(iommu_group));
818 		vfio_device_put(device);
819 		vfio_group_put(group);
820 		return -EBUSY;
821 	}
822 
823 	device = vfio_group_create_device(group, dev, ops, device_data);
824 	if (IS_ERR(device)) {
825 		vfio_group_put(group);
826 		return PTR_ERR(device);
827 	}
828 
829 	/*
830 	 * Drop all but the vfio_device reference.  The vfio_device holds
831 	 * a reference to the vfio_group, which holds a reference to the
832 	 * iommu_group.
833 	 */
834 	vfio_group_put(group);
835 
836 	return 0;
837 }
838 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
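/*
 * Illustrative sketch (not part of this file): the vfio_device_ops a bus
 * driver passes to vfio_add_group_dev().  Each callback receives the
 * device_data pointer registered here; .open/.release bracket the device
 * fd lifetime managed by this core, and .request is how
 * vfio_del_group_dev() asks the user to release a busy device.  The
 * foo_* callbacks are hypothetical.
 *
 *	static const struct vfio_device_ops foo_vfio_ops = {
 *		.name		= "vfio-foo",
 *		.open		= foo_open,
 *		.release	= foo_release,
 *		.ioctl		= foo_ioctl,
 *		.read		= foo_read,
 *		.write		= foo_write,
 *		.mmap		= foo_mmap,
 *		.request	= foo_request,
 *	};
 */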
839 
840 /**
841  * Get a reference to the vfio_device for a device.  Even if the
842  * caller thinks they own the device, they could be racing with a
843  * release call path, so we can't trust drvdata for the shortcut.
844  * Go the long way around, from the iommu_group to the vfio_group
845  * to the vfio_device.
846  */
847 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
848 {
849 	struct iommu_group *iommu_group;
850 	struct vfio_group *group;
851 	struct vfio_device *device;
852 
853 	iommu_group = iommu_group_get(dev);
854 	if (!iommu_group)
855 		return NULL;
856 
857 	group = vfio_group_get_from_iommu(iommu_group);
858 	iommu_group_put(iommu_group);
859 	if (!group)
860 		return NULL;
861 
862 	device = vfio_group_get_device(group, dev);
863 	vfio_group_put(group);
864 
865 	return device;
866 }
867 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
868 
869 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
870 						     char *buf)
871 {
872 	struct vfio_device *it, *device = NULL;
873 
874 	mutex_lock(&group->device_lock);
875 	list_for_each_entry(it, &group->device_list, group_next) {
876 		if (!strcmp(dev_name(it->dev), buf)) {
877 			device = it;
878 			vfio_device_get(device);
879 			break;
880 		}
881 	}
882 	mutex_unlock(&group->device_lock);
883 
884 	return device;
885 }
886 
887 /*
888  * Caller must hold a reference to the vfio_device
889  */
890 void *vfio_device_data(struct vfio_device *device)
891 {
892 	return device->device_data;
893 }
894 EXPORT_SYMBOL_GPL(vfio_device_data);
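/*
 * Illustrative sketch (not part of this file): how another kernel module
 * holding a struct device pointer can briefly borrow the vfio_device and
 * its private data using the exported helpers above.  foo_inspect() is
 * hypothetical.
 *
 *	static int foo_inspect(struct device *dev)
 *	{
 *		struct vfio_device *vdev = vfio_device_get_from_dev(dev);
 *		void *data;
 *
 *		if (!vdev)
 *			return -ENODEV;
 *
 *		data = vfio_device_data(vdev);
 *		...use data only while the reference is held...
 *		vfio_device_put(vdev);
 *		return 0;
 *	}
 */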
895 
896 /* Given a referenced group, check if it contains the device */
897 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
898 {
899 	struct vfio_device *device;
900 
901 	device = vfio_group_get_device(group, dev);
902 	if (!device)
903 		return false;
904 
905 	vfio_device_put(device);
906 	return true;
907 }
908 
909 /*
910  * Decrement the device reference count and wait for the device to be
911  * removed.  Open file descriptors for the device... */
912 void *vfio_del_group_dev(struct device *dev)
913 {
914 	struct vfio_device *device = dev_get_drvdata(dev);
915 	struct vfio_group *group = device->group;
916 	void *device_data = device->device_data;
917 	struct vfio_unbound_dev *unbound;
918 	unsigned int i = 0;
919 	long ret;
920 	bool interrupted = false;
921 
922 	/*
923 	 * The group exists so long as we have a device reference.  Get
924 	 * a group reference and use it to scan for the device going away.
925 	 */
926 	vfio_group_get(group);
927 
928 	/*
929 	 * When the device is removed from the group, the group suddenly
930 	 * becomes non-viable; the device has a driver (until the unbind
931 	 * completes), but it's not present in the group.  This is bad news
932 	 * for any external users that need to re-acquire a group reference
933 	 * in order to match and release their existing reference.  To
934 	 * solve this, we track such devices on the unbound_list to bridge
935 	 * the gap until they're fully unbound.
936 	 */
937 	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
938 	if (unbound) {
939 		unbound->dev = dev;
940 		mutex_lock(&group->unbound_lock);
941 		list_add(&unbound->unbound_next, &group->unbound_list);
942 		mutex_unlock(&group->unbound_lock);
943 	}
944 	WARN_ON(!unbound);
945 
946 	vfio_device_put(device);
947 
948 	/*
949 	 * If the device is still present in the group after the above
950 	 * 'put', then it is in use and we need to request it from the
951 	 * bus driver.  The driver may in turn need to request the
952 	 * device from the user.  We send the request on an arbitrary
953 	 * interval with counter to allow the driver to take escalating
954 	 * measures to release the device if it has the ability to do so.
955 	 */
956 	do {
957 		device = vfio_group_get_device(group, dev);
958 		if (!device)
959 			break;
960 
961 		if (device->ops->request)
962 			device->ops->request(device_data, i++);
963 
964 		vfio_device_put(device);
965 
966 		if (interrupted) {
967 			ret = wait_event_timeout(vfio.release_q,
968 					!vfio_dev_present(group, dev), HZ * 10);
969 		} else {
970 			ret = wait_event_interruptible_timeout(vfio.release_q,
971 					!vfio_dev_present(group, dev), HZ * 10);
972 			if (ret == -ERESTARTSYS) {
973 				interrupted = true;
974 				dev_warn(dev,
975 					 "Device is currently in use, task"
976 					 " \"%s\" (%d) "
977 					 "blocked until device is released",
978 					 current->comm, task_pid_nr(current));
979 			}
980 		}
981 	} while (ret <= 0);
982 
983 	vfio_group_put(group);
984 
985 	return device_data;
986 }
987 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
988 
989 /**
990  * VFIO base fd, /dev/vfio/vfio
991  */
992 static long vfio_ioctl_check_extension(struct vfio_container *container,
993 				       unsigned long arg)
994 {
995 	struct vfio_iommu_driver *driver;
996 	long ret = 0;
997 
998 	down_read(&container->group_lock);
999 
1000 	driver = container->iommu_driver;
1001 
1002 	switch (arg) {
1003 		/* No base extensions yet */
1004 	default:
1005 		/*
1006 		 * If no driver is set, poll all registered drivers for
1007 		 * extensions and return the first positive result.  If
1008 		 * a driver is already set, further queries will be passed
1009 		 * only to that driver.
1010 		 */
1011 		if (!driver) {
1012 			mutex_lock(&vfio.iommu_drivers_lock);
1013 			list_for_each_entry(driver, &vfio.iommu_drivers_list,
1014 					    vfio_next) {
1015 
1016 #ifdef CONFIG_VFIO_NOIOMMU
1017 				if (!list_empty(&container->group_list) &&
1018 				    (container->noiommu !=
1019 				     (driver->ops == &vfio_noiommu_ops)))
1020 					continue;
1021 #endif
1022 
1023 				if (!try_module_get(driver->ops->owner))
1024 					continue;
1025 
1026 				ret = driver->ops->ioctl(NULL,
1027 							 VFIO_CHECK_EXTENSION,
1028 							 arg);
1029 				module_put(driver->ops->owner);
1030 				if (ret > 0)
1031 					break;
1032 			}
1033 			mutex_unlock(&vfio.iommu_drivers_lock);
1034 		} else
1035 			ret = driver->ops->ioctl(container->iommu_data,
1036 						 VFIO_CHECK_EXTENSION, arg);
1037 	}
1038 
1039 	up_read(&container->group_lock);
1040 
1041 	return ret;
1042 }
1043 
1044 /* hold write lock on container->group_lock */
1045 static int __vfio_container_attach_groups(struct vfio_container *container,
1046 					  struct vfio_iommu_driver *driver,
1047 					  void *data)
1048 {
1049 	struct vfio_group *group;
1050 	int ret = -ENODEV;
1051 
1052 	list_for_each_entry(group, &container->group_list, container_next) {
1053 		ret = driver->ops->attach_group(data, group->iommu_group);
1054 		if (ret)
1055 			goto unwind;
1056 	}
1057 
1058 	return ret;
1059 
1060 unwind:
1061 	list_for_each_entry_continue_reverse(group, &container->group_list,
1062 					     container_next) {
1063 		driver->ops->detach_group(data, group->iommu_group);
1064 	}
1065 
1066 	return ret;
1067 }
1068 
1069 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1070 				 unsigned long arg)
1071 {
1072 	struct vfio_iommu_driver *driver;
1073 	long ret = -ENODEV;
1074 
1075 	down_write(&container->group_lock);
1076 
1077 	/*
1078 	 * The container is designed to be an unprivileged interface while
1079 	 * the group can be assigned to specific users.  Therefore, only by
1080 	 * adding a group to a container does the user get the privilege of
1081 	 * enabling the iommu, which may allocate finite resources.  There
1082 	 * is no unset_iommu, but by removing all the groups from a container,
1083 	 * the container is deprivileged and returns to an unset state.
1084 	 */
1085 	if (list_empty(&container->group_list) || container->iommu_driver) {
1086 		up_write(&container->group_lock);
1087 		return -EINVAL;
1088 	}
1089 
1090 	mutex_lock(&vfio.iommu_drivers_lock);
1091 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1092 		void *data;
1093 
1094 #ifdef CONFIG_VFIO_NOIOMMU
1095 		/*
1096 		 * Only noiommu containers can use vfio-noiommu and noiommu
1097 		 * containers can only use vfio-noiommu.
1098 		 */
1099 		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1100 			continue;
1101 #endif
1102 
1103 		if (!try_module_get(driver->ops->owner))
1104 			continue;
1105 
1106 		/*
1107 		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1108 		 * so test which iommu driver reported support for this
1109 		 * extension and call open on them.  We also pass them the
1110 		 * magic, allowing a single driver to support multiple
1111 		 * interfaces if they'd like.
1112 		 */
1113 		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1114 			module_put(driver->ops->owner);
1115 			continue;
1116 		}
1117 
1118 		data = driver->ops->open(arg);
1119 		if (IS_ERR(data)) {
1120 			ret = PTR_ERR(data);
1121 			module_put(driver->ops->owner);
1122 			continue;
1123 		}
1124 
1125 		ret = __vfio_container_attach_groups(container, driver, data);
1126 		if (ret) {
1127 			driver->ops->release(data);
1128 			module_put(driver->ops->owner);
1129 			continue;
1130 		}
1131 
1132 		container->iommu_driver = driver;
1133 		container->iommu_data = data;
1134 		break;
1135 	}
1136 
1137 	mutex_unlock(&vfio.iommu_drivers_lock);
1138 	up_write(&container->group_lock);
1139 
1140 	return ret;
1141 }
1142 
1143 static long vfio_fops_unl_ioctl(struct file *filep,
1144 				unsigned int cmd, unsigned long arg)
1145 {
1146 	struct vfio_container *container = filep->private_data;
1147 	struct vfio_iommu_driver *driver;
1148 	void *data;
1149 	long ret = -EINVAL;
1150 
1151 	if (!container)
1152 		return ret;
1153 
1154 	switch (cmd) {
1155 	case VFIO_GET_API_VERSION:
1156 		ret = VFIO_API_VERSION;
1157 		break;
1158 	case VFIO_CHECK_EXTENSION:
1159 		ret = vfio_ioctl_check_extension(container, arg);
1160 		break;
1161 	case VFIO_SET_IOMMU:
1162 		ret = vfio_ioctl_set_iommu(container, arg);
1163 		break;
1164 	default:
1165 		down_read(&container->group_lock);
1166 
1167 		driver = container->iommu_driver;
1168 		data = container->iommu_data;
1169 
1170 		if (driver) /* passthrough all unrecognized ioctls */
1171 			ret = driver->ops->ioctl(data, cmd, arg);
1172 
1173 		up_read(&container->group_lock);
1174 	}
1175 
1176 	return ret;
1177 }
1178 
1179 #ifdef CONFIG_COMPAT
1180 static long vfio_fops_compat_ioctl(struct file *filep,
1181 				   unsigned int cmd, unsigned long arg)
1182 {
1183 	arg = (unsigned long)compat_ptr(arg);
1184 	return vfio_fops_unl_ioctl(filep, cmd, arg);
1185 }
1186 #endif	/* CONFIG_COMPAT */
1187 
1188 static int vfio_fops_open(struct inode *inode, struct file *filep)
1189 {
1190 	struct vfio_container *container;
1191 
1192 	container = kzalloc(sizeof(*container), GFP_KERNEL);
1193 	if (!container)
1194 		return -ENOMEM;
1195 
1196 	INIT_LIST_HEAD(&container->group_list);
1197 	init_rwsem(&container->group_lock);
1198 	kref_init(&container->kref);
1199 
1200 	filep->private_data = container;
1201 
1202 	return 0;
1203 }
1204 
1205 static int vfio_fops_release(struct inode *inode, struct file *filep)
1206 {
1207 	struct vfio_container *container = filep->private_data;
1208 
1209 	filep->private_data = NULL;
1210 
1211 	vfio_container_put(container);
1212 
1213 	return 0;
1214 }
1215 
1216 /*
1217  * Once an iommu driver is set, we optionally pass read/write/mmap
1218  * on to the driver, allowing management interfaces beyond ioctl.
1219  */
1220 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1221 			      size_t count, loff_t *ppos)
1222 {
1223 	struct vfio_container *container = filep->private_data;
1224 	struct vfio_iommu_driver *driver;
1225 	ssize_t ret = -EINVAL;
1226 
1227 	down_read(&container->group_lock);
1228 
1229 	driver = container->iommu_driver;
1230 	if (likely(driver && driver->ops->read))
1231 		ret = driver->ops->read(container->iommu_data,
1232 					buf, count, ppos);
1233 
1234 	up_read(&container->group_lock);
1235 
1236 	return ret;
1237 }
1238 
1239 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1240 			       size_t count, loff_t *ppos)
1241 {
1242 	struct vfio_container *container = filep->private_data;
1243 	struct vfio_iommu_driver *driver;
1244 	ssize_t ret = -EINVAL;
1245 
1246 	down_read(&container->group_lock);
1247 
1248 	driver = container->iommu_driver;
1249 	if (likely(driver && driver->ops->write))
1250 		ret = driver->ops->write(container->iommu_data,
1251 					 buf, count, ppos);
1252 
1253 	up_read(&container->group_lock);
1254 
1255 	return ret;
1256 }
1257 
1258 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1259 {
1260 	struct vfio_container *container = filep->private_data;
1261 	struct vfio_iommu_driver *driver;
1262 	int ret = -EINVAL;
1263 
1264 	down_read(&container->group_lock);
1265 
1266 	driver = container->iommu_driver;
1267 	if (likely(driver && driver->ops->mmap))
1268 		ret = driver->ops->mmap(container->iommu_data, vma);
1269 
1270 	up_read(&container->group_lock);
1271 
1272 	return ret;
1273 }
1274 
1275 static const struct file_operations vfio_fops = {
1276 	.owner		= THIS_MODULE,
1277 	.open		= vfio_fops_open,
1278 	.release	= vfio_fops_release,
1279 	.read		= vfio_fops_read,
1280 	.write		= vfio_fops_write,
1281 	.unlocked_ioctl	= vfio_fops_unl_ioctl,
1282 #ifdef CONFIG_COMPAT
1283 	.compat_ioctl	= vfio_fops_compat_ioctl,
1284 #endif
1285 	.mmap		= vfio_fops_mmap,
1286 };
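/*
 * Illustrative sketch of the userspace view (not kernel code): the
 * container fd is obtained by opening /dev/vfio/vfio and interrogated
 * through the ioctls handled above.  VFIO_TYPE1_IOMMU is used here only
 * as an example extension to probe for.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;
 */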
1287 
1288 /**
1289  * VFIO Group fd, /dev/vfio/$GROUP
1290  */
1291 static void __vfio_group_unset_container(struct vfio_group *group)
1292 {
1293 	struct vfio_container *container = group->container;
1294 	struct vfio_iommu_driver *driver;
1295 
1296 	down_write(&container->group_lock);
1297 
1298 	driver = container->iommu_driver;
1299 	if (driver)
1300 		driver->ops->detach_group(container->iommu_data,
1301 					  group->iommu_group);
1302 
1303 	group->container = NULL;
1304 	list_del(&group->container_next);
1305 
1306 	/* Detaching the last group deprivileges a container, remove iommu */
1307 	if (driver && list_empty(&container->group_list)) {
1308 		driver->ops->release(container->iommu_data);
1309 		module_put(driver->ops->owner);
1310 		container->iommu_driver = NULL;
1311 		container->iommu_data = NULL;
1312 	}
1313 
1314 	up_write(&container->group_lock);
1315 
1316 	vfio_container_put(container);
1317 }
1318 
1319 /*
1320  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1321  * if there was no container to unset.  Since the ioctl is called on
1322  * the group, we know that it still exists; therefore the only valid
1323  * transition here is 1->0.
1324  */
1325 static int vfio_group_unset_container(struct vfio_group *group)
1326 {
1327 	int users = atomic_cmpxchg(&group->container_users, 1, 0);
1328 
1329 	if (!users)
1330 		return -EINVAL;
1331 	if (users != 1)
1332 		return -EBUSY;
1333 
1334 	__vfio_group_unset_container(group);
1335 
1336 	return 0;
1337 }
1338 
1339 /*
1340  * When removing container users, anything that removes the last user
1341  * implicitly removes the group from the container.  That is, if the
1342  * group file descriptor is closed, as well as any device file descriptors,
1343  * the group is free.
1344  */
1345 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1346 {
1347 	if (0 == atomic_dec_if_positive(&group->container_users))
1348 		__vfio_group_unset_container(group);
1349 }
1350 
1351 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1352 {
1353 	struct fd f;
1354 	struct vfio_container *container;
1355 	struct vfio_iommu_driver *driver;
1356 	int ret = 0;
1357 
1358 	if (atomic_read(&group->container_users))
1359 		return -EINVAL;
1360 
1361 	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1362 		return -EPERM;
1363 
1364 	f = fdget(container_fd);
1365 	if (!f.file)
1366 		return -EBADF;
1367 
1368 	/* Sanity check, is this really our fd? */
1369 	if (f.file->f_op != &vfio_fops) {
1370 		fdput(f);
1371 		return -EINVAL;
1372 	}
1373 
1374 	container = f.file->private_data;
1375 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1376 
1377 	down_write(&container->group_lock);
1378 
1379 	/* Real groups and fake groups cannot mix */
1380 	if (!list_empty(&container->group_list) &&
1381 	    container->noiommu != group->noiommu) {
1382 		ret = -EPERM;
1383 		goto unlock_out;
1384 	}
1385 
1386 	driver = container->iommu_driver;
1387 	if (driver) {
1388 		ret = driver->ops->attach_group(container->iommu_data,
1389 						group->iommu_group);
1390 		if (ret)
1391 			goto unlock_out;
1392 	}
1393 
1394 	group->container = container;
1395 	container->noiommu = group->noiommu;
1396 	list_add(&group->container_next, &container->group_list);
1397 
1398 	/* Get a reference on the container and mark a user within the group */
1399 	vfio_container_get(container);
1400 	atomic_inc(&group->container_users);
1401 
1402 unlock_out:
1403 	up_write(&container->group_lock);
1404 	fdput(f);
1405 	return ret;
1406 }
1407 
1408 static bool vfio_group_viable(struct vfio_group *group)
1409 {
1410 	return (iommu_group_for_each_dev(group->iommu_group,
1411 					 group, vfio_dev_viable) == 0);
1412 }
1413 
1414 static const struct file_operations vfio_device_fops;
1415 
1416 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1417 {
1418 	struct vfio_device *device;
1419 	struct file *filep;
1420 	int ret;
1421 
1422 	if (0 == atomic_read(&group->container_users) ||
1423 	    !group->container->iommu_driver || !vfio_group_viable(group))
1424 		return -EINVAL;
1425 
1426 	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1427 		return -EPERM;
1428 
1429 	device = vfio_device_get_from_name(group, buf);
1430 	if (!device)
1431 		return -ENODEV;
1432 
1433 	ret = device->ops->open(device->device_data);
1434 	if (ret) {
1435 		vfio_device_put(device);
1436 		return ret;
1437 	}
1438 
1439 	/*
1440 	 * We can't use anon_inode_getfd() because we need to modify
1441 	 * the f_mode flags directly to allow more than just ioctls
1442 	 */
1443 	ret = get_unused_fd_flags(O_CLOEXEC);
1444 	if (ret < 0) {
1445 		device->ops->release(device->device_data);
1446 		vfio_device_put(device);
1447 		return ret;
1448 	}
1449 
1450 	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1451 				   device, O_RDWR);
1452 	if (IS_ERR(filep)) {
1453 		put_unused_fd(ret);
1454 		ret = PTR_ERR(filep);
1455 		device->ops->release(device->device_data);
1456 		vfio_device_put(device);
1457 		return ret;
1458 	}
1459 
1460 	/*
1461 	 * TODO: add an anon_inode interface to do this.
1462 	 * Appears to be missing by lack of need rather than
1463 	 * explicitly prevented.  Now there's need.
1464 	 */
1465 	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1466 
1467 	atomic_inc(&group->container_users);
1468 
1469 	fd_install(ret, filep);
1470 
1471 	if (group->noiommu)
1472 		dev_warn(device->dev, "vfio-noiommu device opened by user "
1473 			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1474 
1475 	return ret;
1476 }
1477 
1478 static long vfio_group_fops_unl_ioctl(struct file *filep,
1479 				      unsigned int cmd, unsigned long arg)
1480 {
1481 	struct vfio_group *group = filep->private_data;
1482 	long ret = -ENOTTY;
1483 
1484 	switch (cmd) {
1485 	case VFIO_GROUP_GET_STATUS:
1486 	{
1487 		struct vfio_group_status status;
1488 		unsigned long minsz;
1489 
1490 		minsz = offsetofend(struct vfio_group_status, flags);
1491 
1492 		if (copy_from_user(&status, (void __user *)arg, minsz))
1493 			return -EFAULT;
1494 
1495 		if (status.argsz < minsz)
1496 			return -EINVAL;
1497 
1498 		status.flags = 0;
1499 
1500 		if (vfio_group_viable(group))
1501 			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1502 
1503 		if (group->container)
1504 			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1505 
1506 		if (copy_to_user((void __user *)arg, &status, minsz))
1507 			return -EFAULT;
1508 
1509 		ret = 0;
1510 		break;
1511 	}
1512 	case VFIO_GROUP_SET_CONTAINER:
1513 	{
1514 		int fd;
1515 
1516 		if (get_user(fd, (int __user *)arg))
1517 			return -EFAULT;
1518 
1519 		if (fd < 0)
1520 			return -EINVAL;
1521 
1522 		ret = vfio_group_set_container(group, fd);
1523 		break;
1524 	}
1525 	case VFIO_GROUP_UNSET_CONTAINER:
1526 		ret = vfio_group_unset_container(group);
1527 		break;
1528 	case VFIO_GROUP_GET_DEVICE_FD:
1529 	{
1530 		char *buf;
1531 
1532 		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1533 		if (IS_ERR(buf))
1534 			return PTR_ERR(buf);
1535 
1536 		ret = vfio_group_get_device_fd(group, buf);
1537 		kfree(buf);
1538 		break;
1539 	}
1540 	}
1541 
1542 	return ret;
1543 }
1544 
1545 #ifdef CONFIG_COMPAT
1546 static long vfio_group_fops_compat_ioctl(struct file *filep,
1547 					 unsigned int cmd, unsigned long arg)
1548 {
1549 	arg = (unsigned long)compat_ptr(arg);
1550 	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1551 }
1552 #endif	/* CONFIG_COMPAT */
1553 
1554 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1555 {
1556 	struct vfio_group *group;
1557 	int opened;
1558 
1559 	group = vfio_group_get_from_minor(iminor(inode));
1560 	if (!group)
1561 		return -ENODEV;
1562 
1563 	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1564 		vfio_group_put(group);
1565 		return -EPERM;
1566 	}
1567 
1568 	/* Do we need multiple instances of the group open?  Seems not. */
1569 	opened = atomic_cmpxchg(&group->opened, 0, 1);
1570 	if (opened) {
1571 		vfio_group_put(group);
1572 		return -EBUSY;
1573 	}
1574 
1575 	/* Is something still in use from a previous open? */
1576 	if (group->container) {
1577 		atomic_dec(&group->opened);
1578 		vfio_group_put(group);
1579 		return -EBUSY;
1580 	}
1581 
1582 	filep->private_data = group;
1583 
1584 	return 0;
1585 }
1586 
1587 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1588 {
1589 	struct vfio_group *group = filep->private_data;
1590 
1591 	filep->private_data = NULL;
1592 
1593 	vfio_group_try_dissolve_container(group);
1594 
1595 	atomic_dec(&group->opened);
1596 
1597 	vfio_group_put(group);
1598 
1599 	return 0;
1600 }
1601 
1602 static const struct file_operations vfio_group_fops = {
1603 	.owner		= THIS_MODULE,
1604 	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1605 #ifdef CONFIG_COMPAT
1606 	.compat_ioctl	= vfio_group_fops_compat_ioctl,
1607 #endif
1608 	.open		= vfio_group_fops_open,
1609 	.release	= vfio_group_fops_release,
1610 };
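/*
 * Illustrative sketch of the userspace view (not kernel code): a group fd
 * is opened via /dev/vfio/$GROUP, checked for viability, bound to a
 * container, after which an IOMMU driver can be set on the container and
 * device fds requested.  "26" and "0000:06:0d.0" are example names only.
 *
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */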
1611 
1612 /**
1613  * VFIO Device fd
1614  */
1615 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1616 {
1617 	struct vfio_device *device = filep->private_data;
1618 
1619 	device->ops->release(device->device_data);
1620 
1621 	vfio_group_try_dissolve_container(device->group);
1622 
1623 	vfio_device_put(device);
1624 
1625 	return 0;
1626 }
1627 
1628 static long vfio_device_fops_unl_ioctl(struct file *filep,
1629 				       unsigned int cmd, unsigned long arg)
1630 {
1631 	struct vfio_device *device = filep->private_data;
1632 
1633 	if (unlikely(!device->ops->ioctl))
1634 		return -EINVAL;
1635 
1636 	return device->ops->ioctl(device->device_data, cmd, arg);
1637 }
1638 
1639 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1640 				     size_t count, loff_t *ppos)
1641 {
1642 	struct vfio_device *device = filep->private_data;
1643 
1644 	if (unlikely(!device->ops->read))
1645 		return -EINVAL;
1646 
1647 	return device->ops->read(device->device_data, buf, count, ppos);
1648 }
1649 
1650 static ssize_t vfio_device_fops_write(struct file *filep,
1651 				      const char __user *buf,
1652 				      size_t count, loff_t *ppos)
1653 {
1654 	struct vfio_device *device = filep->private_data;
1655 
1656 	if (unlikely(!device->ops->write))
1657 		return -EINVAL;
1658 
1659 	return device->ops->write(device->device_data, buf, count, ppos);
1660 }
1661 
1662 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1663 {
1664 	struct vfio_device *device = filep->private_data;
1665 
1666 	if (unlikely(!device->ops->mmap))
1667 		return -EINVAL;
1668 
1669 	return device->ops->mmap(device->device_data, vma);
1670 }
1671 
1672 #ifdef CONFIG_COMPAT
1673 static long vfio_device_fops_compat_ioctl(struct file *filep,
1674 					  unsigned int cmd, unsigned long arg)
1675 {
1676 	arg = (unsigned long)compat_ptr(arg);
1677 	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1678 }
1679 #endif	/* CONFIG_COMPAT */
1680 
1681 static const struct file_operations vfio_device_fops = {
1682 	.owner		= THIS_MODULE,
1683 	.release	= vfio_device_fops_release,
1684 	.read		= vfio_device_fops_read,
1685 	.write		= vfio_device_fops_write,
1686 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1687 #ifdef CONFIG_COMPAT
1688 	.compat_ioctl	= vfio_device_fops_compat_ioctl,
1689 #endif
1690 	.mmap		= vfio_device_fops_mmap,
1691 };
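/*
 * Illustrative sketch of the userspace view (not kernel code): the fd
 * returned by VFIO_GROUP_GET_DEVICE_FD supports the bus driver's own
 * ioctls plus read/write/mmap, all of which are forwarded by the fops
 * above to the vfio_device_ops callbacks.
 *
 *	struct vfio_device_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(device, VFIO_DEVICE_GET_INFO, &info);
 *	(followed by VFIO_DEVICE_GET_REGION_INFO, mmap() of regions, etc.,
 *	 all implemented by the bus driver, e.g. vfio-pci)
 */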
1692 
1693 /**
1694  * External user API, exported by symbols to be linked dynamically.
1695  *
1696  * The protocol includes:
1697  *  1. do normal VFIO init operation:
1698  *	- opening a new container;
1699  *	- attaching group(s) to it;
1700  *	- setting an IOMMU driver for a container.
1701  * When IOMMU is set for a container, all groups in it are
1702  * considered ready to use by an external user.
1703  *
1704  * 2. User space passes a group fd to an external user.
1705  * The external user calls vfio_group_get_external_user()
1706  * to verify that:
1707  *	- the group is initialized;
1708  *	- IOMMU is set for it.
1709  * If both checks passed, vfio_group_get_external_user()
1710  * increments the container user counter to prevent
1711  * the VFIO group from disposal before KVM exits.
1712  *
1713  * 3. The external user calls vfio_external_user_iommu_id()
1714  * to know an IOMMU ID.
1715  *
1716  * 4. When the external KVM finishes, it calls
1717  * vfio_group_put_external_user() to release the VFIO group.
1718  * This call decrements the container user counter.
1719  */
1720 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1721 {
1722 	struct vfio_group *group = filep->private_data;
1723 
1724 	if (filep->f_op != &vfio_group_fops)
1725 		return ERR_PTR(-EINVAL);
1726 
1727 	if (!atomic_inc_not_zero(&group->container_users))
1728 		return ERR_PTR(-EINVAL);
1729 
1730 	if (group->noiommu) {
1731 		atomic_dec(&group->container_users);
1732 		return ERR_PTR(-EPERM);
1733 	}
1734 
1735 	if (!group->container->iommu_driver ||
1736 			!vfio_group_viable(group)) {
1737 		atomic_dec(&group->container_users);
1738 		return ERR_PTR(-EINVAL);
1739 	}
1740 
1741 	vfio_group_get(group);
1742 
1743 	return group;
1744 }
1745 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1746 
1747 void vfio_group_put_external_user(struct vfio_group *group)
1748 {
1749 	vfio_group_try_dissolve_container(group);
1750 	vfio_group_put(group);
1751 }
1752 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1753 
1754 bool vfio_external_group_match_file(struct vfio_group *test_group,
1755 				    struct file *filep)
1756 {
1757 	struct vfio_group *group = filep->private_data;
1758 
1759 	return (filep->f_op == &vfio_group_fops) && (group == test_group);
1760 }
1761 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1762 
1763 int vfio_external_user_iommu_id(struct vfio_group *group)
1764 {
1765 	return iommu_group_id(group->iommu_group);
1766 }
1767 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1768 
1769 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1770 {
1771 	return vfio_ioctl_check_extension(group->container, arg);
1772 }
1773 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
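/*
 * Illustrative sketch (not part of this file): an in-kernel external user
 * such as the kvm-vfio device follows the protocol documented above.
 * foo_add_group() is hypothetical; it receives a group fd from userspace.
 *
 *	static int foo_add_group(int group_fd)
 *	{
 *		struct fd f = fdget(group_fd);
 *		struct vfio_group *group;
 *		int iommu_id;
 *
 *		if (!f.file)
 *			return -EBADF;
 *
 *		group = vfio_group_get_external_user(f.file);
 *		fdput(f);
 *		if (IS_ERR(group))
 *			return PTR_ERR(group);
 *
 *		iommu_id = vfio_external_user_iommu_id(group);
 *		...hold the group reference while the external user needs it...
 *		vfio_group_put_external_user(group);
 *		return 0;
 *	}
 */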
1774 
1775 /**
1776  * Sub-module support
1777  */
1778 /*
1779  * Helper for managing a buffer of info chain capabilities, allocate or
1780  * reallocate a buffer with additional @size, filling in @id and @version
1781  * of the capability.  A pointer to the new capability is returned.
1782  *
1783  * NB. The chain is based at the head of the buffer, so new entries are
1784  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1785  * next offsets prior to copying to the user buffer.
1786  */
1787 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1788 					       size_t size, u16 id, u16 version)
1789 {
1790 	void *buf;
1791 	struct vfio_info_cap_header *header, *tmp;
1792 
1793 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1794 	if (!buf) {
1795 		kfree(caps->buf);
1796 		caps->size = 0;
1797 		return ERR_PTR(-ENOMEM);
1798 	}
1799 
1800 	caps->buf = buf;
1801 	header = buf + caps->size;
1802 
1803 	/* Eventually copied to user buffer, zero */
1804 	memset(header, 0, size);
1805 
1806 	header->id = id;
1807 	header->version = version;
1808 
1809 	/* Add to the end of the capability chain */
1810 	for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next)
1811 		; /* nothing */
1812 
1813 	tmp->next = caps->size;
1814 	caps->size += size;
1815 
1816 	return header;
1817 }
1818 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1819 
1820 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1821 {
1822 	struct vfio_info_cap_header *tmp;
1823 
1824 	for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset)
1825 		tmp->next += offset;
1826 }
1827 EXPORT_SYMBOL_GPL(vfio_info_cap_shift);
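/*
 * Illustrative sketch (not part of this file): how a sub-module builds a
 * capability chain while servicing an info ioctl, following the
 * add-then-shift pattern of the helpers above.  FOO_CAP_ID, struct
 * foo_cap and the surrounding info layout are hypothetical.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *header;
 *
 *	header = vfio_info_cap_add(&caps, sizeof(struct foo_cap),
 *				   FOO_CAP_ID, 1);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *	...fill in the capability body that follows the header...
 *
 *	vfio_info_cap_shift(&caps, sizeof(info));
 *	copy_to_user((void __user *)arg + sizeof(info), caps.buf, caps.size);
 *	kfree(caps.buf);
 */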
1828 
1829 /**
1830  * Module/class support
1831  */
1832 static char *vfio_devnode(struct device *dev, umode_t *mode)
1833 {
1834 	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
1835 }
1836 
1837 static struct miscdevice vfio_dev = {
1838 	.minor = VFIO_MINOR,
1839 	.name = "vfio",
1840 	.fops = &vfio_fops,
1841 	.nodename = "vfio/vfio",
1842 	.mode = S_IRUGO | S_IWUGO,
1843 };
1844 
1845 static int __init vfio_init(void)
1846 {
1847 	int ret;
1848 
1849 	idr_init(&vfio.group_idr);
1850 	mutex_init(&vfio.group_lock);
1851 	mutex_init(&vfio.iommu_drivers_lock);
1852 	INIT_LIST_HEAD(&vfio.group_list);
1853 	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
1854 	init_waitqueue_head(&vfio.release_q);
1855 
1856 	ret = misc_register(&vfio_dev);
1857 	if (ret) {
1858 		pr_err("vfio: misc device register failed\n");
1859 		return ret;
1860 	}
1861 
1862 	/* /dev/vfio/$GROUP */
1863 	vfio.class = class_create(THIS_MODULE, "vfio");
1864 	if (IS_ERR(vfio.class)) {
1865 		ret = PTR_ERR(vfio.class);
1866 		goto err_class;
1867 	}
1868 
1869 	vfio.class->devnode = vfio_devnode;
1870 
1871 	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
1872 	if (ret)
1873 		goto err_alloc_chrdev;
1874 
1875 	cdev_init(&vfio.group_cdev, &vfio_group_fops);
1876 	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
1877 	if (ret)
1878 		goto err_cdev_add;
1879 
1880 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1881 
1882 	/*
1883 	 * Attempt to load known iommu-drivers.  This gives us a working
1884 	 * environment without the user needing to explicitly load iommu
1885 	 * drivers.
1886 	 */
1887 	request_module_nowait("vfio_iommu_type1");
1888 	request_module_nowait("vfio_iommu_spapr_tce");
1889 
1890 #ifdef CONFIG_VFIO_NOIOMMU
1891 	vfio_register_iommu_driver(&vfio_noiommu_ops);
1892 #endif
1893 	return 0;
1894 
1895 err_cdev_add:
1896 	unregister_chrdev_region(vfio.group_devt, MINORMASK);
1897 err_alloc_chrdev:
1898 	class_destroy(vfio.class);
1899 	vfio.class = NULL;
1900 err_class:
1901 	misc_deregister(&vfio_dev);
1902 	return ret;
1903 }
1904 
1905 static void __exit vfio_cleanup(void)
1906 {
1907 	WARN_ON(!list_empty(&vfio.group_list));
1908 
1909 #ifdef CONFIG_VFIO_NOIOMMU
1910 	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
1911 #endif
1912 	idr_destroy(&vfio.group_idr);
1913 	cdev_del(&vfio.group_cdev);
1914 	unregister_chrdev_region(vfio.group_devt, MINORMASK);
1915 	class_destroy(vfio.class);
1916 	vfio.class = NULL;
1917 	misc_deregister(&vfio_dev);
1918 }
1919 
1920 module_init(vfio_init);
1921 module_exit(vfio_cleanup);
1922 
1923 MODULE_VERSION(DRIVER_VERSION);
1924 MODULE_LICENSE("GPL v2");
1925 MODULE_AUTHOR(DRIVER_AUTHOR);
1926 MODULE_DESCRIPTION(DRIVER_DESC);
1927 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
1928 MODULE_ALIAS("devname:vfio/vfio");
1929