1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/file.h>
17 #include <linux/anon_inodes.h>
18 #include <linux/fs.h>
19 #include <linux/idr.h>
20 #include <linux/iommu.h>
21 #include <linux/list.h>
22 #include <linux/miscdevice.h>
23 #include <linux/module.h>
24 #include <linux/mutex.h>
25 #include <linux/pci.h>
26 #include <linux/rwsem.h>
27 #include <linux/sched.h>
28 #include <linux/slab.h>
29 #include <linux/stat.h>
30 #include <linux/string.h>
31 #include <linux/uaccess.h>
32 #include <linux/vfio.h>
33 #include <linux/wait.h>
34 #include <linux/sched/signal.h>
35 
36 #define DRIVER_VERSION	"0.3"
37 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
38 #define DRIVER_DESC	"VFIO - User Level meta-driver"
39 
40 static struct vfio {
41 	struct class			*class;
42 	struct list_head		iommu_drivers_list;
43 	struct mutex			iommu_drivers_lock;
44 	struct list_head		group_list;
45 	struct idr			group_idr;
46 	struct mutex			group_lock;
47 	struct cdev			group_cdev;
48 	dev_t				group_devt;
49 } vfio;
50 
51 struct vfio_iommu_driver {
52 	const struct vfio_iommu_driver_ops	*ops;
53 	struct list_head			vfio_next;
54 };
55 
56 struct vfio_container {
57 	struct kref			kref;
58 	struct list_head		group_list;
59 	struct rw_semaphore		group_lock;
60 	struct vfio_iommu_driver	*iommu_driver;
61 	void				*iommu_data;
62 	bool				noiommu;
63 };
64 
65 struct vfio_unbound_dev {
66 	struct device			*dev;
67 	struct list_head		unbound_next;
68 };
69 
70 struct vfio_group {
71 	struct kref			kref;
72 	int				minor;
73 	atomic_t			container_users;
74 	struct iommu_group		*iommu_group;
75 	struct vfio_container		*container;
76 	struct list_head		device_list;
77 	struct mutex			device_lock;
78 	struct device			*dev;
79 	struct notifier_block		nb;
80 	struct list_head		vfio_next;
81 	struct list_head		container_next;
82 	struct list_head		unbound_list;
83 	struct mutex			unbound_lock;
84 	atomic_t			opened;
85 	wait_queue_head_t		container_q;
86 	bool				noiommu;
87 	unsigned int			dev_counter;
88 	struct kvm			*kvm;
89 	struct blocking_notifier_head	notifier;
90 };
91 
92 #ifdef CONFIG_VFIO_NOIOMMU
93 static bool noiommu __read_mostly;
94 module_param_named(enable_unsafe_noiommu_mode,
95 		   noiommu, bool, S_IRUGO | S_IWUSR);
96 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
97 #endif
98 
99 static DEFINE_XARRAY(vfio_device_set_xa);
100 
101 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
102 {
103 	unsigned long idx = (unsigned long)set_id;
104 	struct vfio_device_set *new_dev_set;
105 	struct vfio_device_set *dev_set;
106 
107 	if (WARN_ON(!set_id))
108 		return -EINVAL;
109 
110 	/*
111 	 * Atomically acquire a singleton object in the xarray for this set_id
112 	 */
113 	xa_lock(&vfio_device_set_xa);
114 	dev_set = xa_load(&vfio_device_set_xa, idx);
115 	if (dev_set)
116 		goto found_get_ref;
117 	xa_unlock(&vfio_device_set_xa);
118 
119 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
120 	if (!new_dev_set)
121 		return -ENOMEM;
122 	mutex_init(&new_dev_set->lock);
123 	INIT_LIST_HEAD(&new_dev_set->device_list);
124 	new_dev_set->set_id = set_id;
125 
126 	xa_lock(&vfio_device_set_xa);
127 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
128 			       GFP_KERNEL);
129 	if (!dev_set) {
130 		dev_set = new_dev_set;
131 		goto found_get_ref;
132 	}
133 
134 	kfree(new_dev_set);
135 	if (xa_is_err(dev_set)) {
136 		xa_unlock(&vfio_device_set_xa);
137 		return xa_err(dev_set);
138 	}
139 
140 found_get_ref:
141 	dev_set->device_count++;
142 	xa_unlock(&vfio_device_set_xa);
143 	mutex_lock(&dev_set->lock);
144 	device->dev_set = dev_set;
145 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
146 	mutex_unlock(&dev_set->lock);
147 	return 0;
148 }
149 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
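/*
 * Illustrative sketch (not part of this file): a hypothetical driver that
 * wants all functions behind one PCI bus to share a vfio_device_set, so that
 * dev_set->lock serializes their open_device()/close_device() paths.
 * "struct my_vfio_pci", my_alloc_and_init() and my_probe() are assumptions
 * for the example; only vfio_assign_device_set() and
 * vfio_register_group_dev() come from this file.
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		struct my_vfio_pci *mdev = my_alloc_and_init(pdev);
 *		int ret;
 *
 *		// Any stable kernel pointer works as set_id; devices passing
 *		// the same pointer end up in the same vfio_device_set.
 *		ret = vfio_assign_device_set(&mdev->vdev, pdev->bus);
 *		if (ret)
 *			return ret;
 *
 *		return vfio_register_group_dev(&mdev->vdev);
 *	}
 */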
150 
151 static void vfio_release_device_set(struct vfio_device *device)
152 {
153 	struct vfio_device_set *dev_set = device->dev_set;
154 
155 	if (!dev_set)
156 		return;
157 
158 	mutex_lock(&dev_set->lock);
159 	list_del(&device->dev_set_list);
160 	mutex_unlock(&dev_set->lock);
161 
162 	xa_lock(&vfio_device_set_xa);
163 	if (!--dev_set->device_count) {
164 		__xa_erase(&vfio_device_set_xa,
165 			   (unsigned long)dev_set->set_id);
166 		mutex_destroy(&dev_set->lock);
167 		kfree(dev_set);
168 	}
169 	xa_unlock(&vfio_device_set_xa);
170 }
171 
172 /*
173  * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
174  * and remove functions, any use cases other than acquiring the first
175  * reference for the purpose of calling vfio_register_group_dev() or removing
176  * that symmetric reference after vfio_unregister_group_dev() should use the raw
177  * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
178  * removes the device from the dummy group and cannot be nested.
179  */
180 struct iommu_group *vfio_iommu_group_get(struct device *dev)
181 {
182 	struct iommu_group *group;
183 	int __maybe_unused ret;
184 
185 	group = iommu_group_get(dev);
186 
187 #ifdef CONFIG_VFIO_NOIOMMU
188 	/*
189 	 * With noiommu enabled, an IOMMU group will be created for a device
190 	 * that doesn't already have one and doesn't have an iommu_ops on their
191 	 * bus.  We set iommudata simply to be able to identify these groups
192 	 * as special use and for reclamation later.
193 	 */
194 	if (group || !noiommu || iommu_present(dev->bus))
195 		return group;
196 
197 	group = iommu_group_alloc();
198 	if (IS_ERR(group))
199 		return NULL;
200 
201 	iommu_group_set_name(group, "vfio-noiommu");
202 	iommu_group_set_iommudata(group, &noiommu, NULL);
203 	ret = iommu_group_add_device(group, dev);
204 	if (ret) {
205 		iommu_group_put(group);
206 		return NULL;
207 	}
208 
209 	/*
210 	 * Where to taint?  At this point we've added an IOMMU group for a
211 	 * device that is not backed by iommu_ops, therefore any iommu_
212 	 * callback using iommu_ops can legitimately Oops.  So, while we may
213 	 * be about to give a DMA capable device to a user without IOMMU
214 	 * protection, which is clearly taint-worthy, let's go ahead and do
215 	 * it here.
216 	 */
217 	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
218 	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
219 #endif
220 
221 	return group;
222 }
223 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
224 
225 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
226 {
227 #ifdef CONFIG_VFIO_NOIOMMU
228 	if (iommu_group_get_iommudata(group) == &noiommu)
229 		iommu_group_remove_device(dev);
230 #endif
231 
232 	iommu_group_put(group);
233 }
234 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
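/*
 * Illustrative sketch (not part of this file) of the pairing rule described
 * above vfio_iommu_group_get(): a vfio bus driver takes the first group
 * reference in its probe path, registers the vfio_device, and drops the
 * symmetric reference only after unregistering in remove.  my_vdev(),
 * my_bus_probe() and my_bus_remove() are assumptions for the example.
 *
 *	static int my_bus_probe(struct device *dev)
 *	{
 *		struct iommu_group *group = vfio_iommu_group_get(dev);
 *		int ret;
 *
 *		if (!group)
 *			return -EINVAL;
 *		ret = vfio_register_group_dev(&my_vdev(dev)->vdev);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);	// undo on failure
 *		return ret;
 *	}
 *
 *	static void my_bus_remove(struct device *dev)
 *	{
 *		vfio_unregister_group_dev(&my_vdev(dev)->vdev);
 *		vfio_iommu_group_put(my_vdev(dev)->group, dev);	// symmetric put
 *	}
 */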
235 
236 #ifdef CONFIG_VFIO_NOIOMMU
237 static void *vfio_noiommu_open(unsigned long arg)
238 {
239 	if (arg != VFIO_NOIOMMU_IOMMU)
240 		return ERR_PTR(-EINVAL);
241 	if (!capable(CAP_SYS_RAWIO))
242 		return ERR_PTR(-EPERM);
243 
244 	return NULL;
245 }
246 
247 static void vfio_noiommu_release(void *iommu_data)
248 {
249 }
250 
251 static long vfio_noiommu_ioctl(void *iommu_data,
252 			       unsigned int cmd, unsigned long arg)
253 {
254 	if (cmd == VFIO_CHECK_EXTENSION)
255 		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
256 
257 	return -ENOTTY;
258 }
259 
260 static int vfio_noiommu_attach_group(void *iommu_data,
261 				     struct iommu_group *iommu_group)
262 {
263 	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
264 }
265 
266 static void vfio_noiommu_detach_group(void *iommu_data,
267 				      struct iommu_group *iommu_group)
268 {
269 }
270 
271 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
272 	.name = "vfio-noiommu",
273 	.owner = THIS_MODULE,
274 	.open = vfio_noiommu_open,
275 	.release = vfio_noiommu_release,
276 	.ioctl = vfio_noiommu_ioctl,
277 	.attach_group = vfio_noiommu_attach_group,
278 	.detach_group = vfio_noiommu_detach_group,
279 };
280 #endif
281 
282 
283 /**
284  * IOMMU driver registration
285  */
286 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
287 {
288 	struct vfio_iommu_driver *driver, *tmp;
289 
290 	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
291 	if (!driver)
292 		return -ENOMEM;
293 
294 	driver->ops = ops;
295 
296 	mutex_lock(&vfio.iommu_drivers_lock);
297 
298 	/* Check for duplicates */
299 	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
300 		if (tmp->ops == ops) {
301 			mutex_unlock(&vfio.iommu_drivers_lock);
302 			kfree(driver);
303 			return -EINVAL;
304 		}
305 	}
306 
307 	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
308 
309 	mutex_unlock(&vfio.iommu_drivers_lock);
310 
311 	return 0;
312 }
313 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
314 
315 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
316 {
317 	struct vfio_iommu_driver *driver;
318 
319 	mutex_lock(&vfio.iommu_drivers_lock);
320 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
321 		if (driver->ops == ops) {
322 			list_del(&driver->vfio_next);
323 			mutex_unlock(&vfio.iommu_drivers_lock);
324 			kfree(driver);
325 			return;
326 		}
327 	}
328 	mutex_unlock(&vfio.iommu_drivers_lock);
329 }
330 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
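/*
 * Illustrative sketch (not part of this file): how an IOMMU backend module
 * plugs into the registration calls above, in the same spirit as the
 * vfio_noiommu_ops table earlier in this file.  "my_iommu_ops" and the
 * my_*() callbacks are assumptions standing in for a fully populated
 * struct vfio_iommu_driver_ops.
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_ops = {
 *		.name		= "my-iommu-backend",
 *		.owner		= THIS_MODULE,
 *		.open		= my_open,
 *		.release	= my_release,
 *		.ioctl		= my_ioctl,
 *		.attach_group	= my_attach_group,
 *		.detach_group	= my_detach_group,
 *	};
 *
 *	static int __init my_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_ops);
 *	}
 *
 *	static void __exit my_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_ops);
 *	}
 */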
331 
332 /**
333  * Group minor allocation/free - both called with vfio.group_lock held
334  */
335 static int vfio_alloc_group_minor(struct vfio_group *group)
336 {
337 	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
338 }
339 
340 static void vfio_free_group_minor(int minor)
341 {
342 	idr_remove(&vfio.group_idr, minor);
343 }
344 
345 static int vfio_iommu_group_notifier(struct notifier_block *nb,
346 				     unsigned long action, void *data);
347 static void vfio_group_get(struct vfio_group *group);
348 
349 /**
350  * Container objects - containers are created when /dev/vfio/vfio is
351  * opened, but their lifecycle extends until the last user is done, so
352  * it's freed via kref.  Must support container/group/device being
353  * closed in any order.
354  */
355 static void vfio_container_get(struct vfio_container *container)
356 {
357 	kref_get(&container->kref);
358 }
359 
360 static void vfio_container_release(struct kref *kref)
361 {
362 	struct vfio_container *container;
363 	container = container_of(kref, struct vfio_container, kref);
364 
365 	kfree(container);
366 }
367 
368 static void vfio_container_put(struct vfio_container *container)
369 {
370 	kref_put(&container->kref, vfio_container_release);
371 }
372 
373 static void vfio_group_unlock_and_free(struct vfio_group *group)
374 {
375 	mutex_unlock(&vfio.group_lock);
376 	/*
377 	 * Unregister outside of lock.  A spurious callback is harmless now
378 	 * that the group is no longer in vfio.group_list.
379 	 */
380 	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
381 	kfree(group);
382 }
383 
384 /**
385  * Group objects - create, release, get, put, search
386  */
387 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
388 {
389 	struct vfio_group *group, *tmp;
390 	struct device *dev;
391 	int ret, minor;
392 
393 	group = kzalloc(sizeof(*group), GFP_KERNEL);
394 	if (!group)
395 		return ERR_PTR(-ENOMEM);
396 
397 	kref_init(&group->kref);
398 	INIT_LIST_HEAD(&group->device_list);
399 	mutex_init(&group->device_lock);
400 	INIT_LIST_HEAD(&group->unbound_list);
401 	mutex_init(&group->unbound_lock);
402 	atomic_set(&group->container_users, 0);
403 	atomic_set(&group->opened, 0);
404 	init_waitqueue_head(&group->container_q);
405 	group->iommu_group = iommu_group;
406 #ifdef CONFIG_VFIO_NOIOMMU
407 	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
408 #endif
409 	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
410 
411 	group->nb.notifier_call = vfio_iommu_group_notifier;
412 
413 	/*
414 	 * blocking notifiers acquire a rwsem around registering and hold
415 	 * it around callback.  Therefore, need to register outside of
416 	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
417 	 * do anything unless it can find the group in vfio.group_list, so
418 	 * no harm in registering early.
419 	 */
420 	ret = iommu_group_register_notifier(iommu_group, &group->nb);
421 	if (ret) {
422 		kfree(group);
423 		return ERR_PTR(ret);
424 	}
425 
426 	mutex_lock(&vfio.group_lock);
427 
428 	/* Did we race creating this group? */
429 	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
430 		if (tmp->iommu_group == iommu_group) {
431 			vfio_group_get(tmp);
432 			vfio_group_unlock_and_free(group);
433 			return tmp;
434 		}
435 	}
436 
437 	minor = vfio_alloc_group_minor(group);
438 	if (minor < 0) {
439 		vfio_group_unlock_and_free(group);
440 		return ERR_PTR(minor);
441 	}
442 
443 	dev = device_create(vfio.class, NULL,
444 			    MKDEV(MAJOR(vfio.group_devt), minor),
445 			    group, "%s%d", group->noiommu ? "noiommu-" : "",
446 			    iommu_group_id(iommu_group));
447 	if (IS_ERR(dev)) {
448 		vfio_free_group_minor(minor);
449 		vfio_group_unlock_and_free(group);
450 		return ERR_CAST(dev);
451 	}
452 
453 	group->minor = minor;
454 	group->dev = dev;
455 
456 	list_add(&group->vfio_next, &vfio.group_list);
457 
458 	mutex_unlock(&vfio.group_lock);
459 
460 	return group;
461 }
462 
463 /* called with vfio.group_lock held */
464 static void vfio_group_release(struct kref *kref)
465 {
466 	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
467 	struct vfio_unbound_dev *unbound, *tmp;
468 	struct iommu_group *iommu_group = group->iommu_group;
469 
470 	WARN_ON(!list_empty(&group->device_list));
471 	WARN_ON(group->notifier.head);
472 
473 	list_for_each_entry_safe(unbound, tmp,
474 				 &group->unbound_list, unbound_next) {
475 		list_del(&unbound->unbound_next);
476 		kfree(unbound);
477 	}
478 
479 	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
480 	list_del(&group->vfio_next);
481 	vfio_free_group_minor(group->minor);
482 	vfio_group_unlock_and_free(group);
483 	iommu_group_put(iommu_group);
484 }
485 
486 static void vfio_group_put(struct vfio_group *group)
487 {
488 	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
489 }
490 
491 struct vfio_group_put_work {
492 	struct work_struct work;
493 	struct vfio_group *group;
494 };
495 
496 static void vfio_group_put_bg(struct work_struct *work)
497 {
498 	struct vfio_group_put_work *do_work;
499 
500 	do_work = container_of(work, struct vfio_group_put_work, work);
501 
502 	vfio_group_put(do_work->group);
503 	kfree(do_work);
504 }
505 
506 static void vfio_group_schedule_put(struct vfio_group *group)
507 {
508 	struct vfio_group_put_work *do_work;
509 
510 	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
511 	if (WARN_ON(!do_work))
512 		return;
513 
514 	INIT_WORK(&do_work->work, vfio_group_put_bg);
515 	do_work->group = group;
516 	schedule_work(&do_work->work);
517 }
518 
519 /* Assume group_lock or group reference is held */
520 static void vfio_group_get(struct vfio_group *group)
521 {
522 	kref_get(&group->kref);
523 }
524 
525 /*
526  * Not really a try as we will sleep for mutex, but we need to make
527  * sure the group pointer is valid under lock and get a reference.
528  */
529 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
530 {
531 	struct vfio_group *target = group;
532 
533 	mutex_lock(&vfio.group_lock);
534 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
535 		if (group == target) {
536 			vfio_group_get(group);
537 			mutex_unlock(&vfio.group_lock);
538 			return group;
539 		}
540 	}
541 	mutex_unlock(&vfio.group_lock);
542 
543 	return NULL;
544 }
545 
546 static
547 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
548 {
549 	struct vfio_group *group;
550 
551 	mutex_lock(&vfio.group_lock);
552 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
553 		if (group->iommu_group == iommu_group) {
554 			vfio_group_get(group);
555 			mutex_unlock(&vfio.group_lock);
556 			return group;
557 		}
558 	}
559 	mutex_unlock(&vfio.group_lock);
560 
561 	return NULL;
562 }
563 
564 static struct vfio_group *vfio_group_get_from_minor(int minor)
565 {
566 	struct vfio_group *group;
567 
568 	mutex_lock(&vfio.group_lock);
569 	group = idr_find(&vfio.group_idr, minor);
570 	if (!group) {
571 		mutex_unlock(&vfio.group_lock);
572 		return NULL;
573 	}
574 	vfio_group_get(group);
575 	mutex_unlock(&vfio.group_lock);
576 
577 	return group;
578 }
579 
580 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
581 {
582 	struct iommu_group *iommu_group;
583 	struct vfio_group *group;
584 
585 	iommu_group = iommu_group_get(dev);
586 	if (!iommu_group)
587 		return NULL;
588 
589 	group = vfio_group_get_from_iommu(iommu_group);
590 	iommu_group_put(iommu_group);
591 
592 	return group;
593 }
594 
595 /**
596  * Device objects - create, release, get, put, search
597  */
598 /* Device reference always implies a group reference */
599 void vfio_device_put(struct vfio_device *device)
600 {
601 	if (refcount_dec_and_test(&device->refcount))
602 		complete(&device->comp);
603 }
604 EXPORT_SYMBOL_GPL(vfio_device_put);
605 
606 static bool vfio_device_try_get(struct vfio_device *device)
607 {
608 	return refcount_inc_not_zero(&device->refcount);
609 }
610 
611 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
612 						 struct device *dev)
613 {
614 	struct vfio_device *device;
615 
616 	mutex_lock(&group->device_lock);
617 	list_for_each_entry(device, &group->device_list, group_next) {
618 		if (device->dev == dev && vfio_device_try_get(device)) {
619 			mutex_unlock(&group->device_lock);
620 			return device;
621 		}
622 	}
623 	mutex_unlock(&group->device_lock);
624 	return NULL;
625 }
626 
627 /*
628  * Some drivers, like pci-stub, are only used to prevent other drivers from
629  * claiming a device and are therefore perfectly legitimate for a user owned
630  * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
631  * of the device, but it does prevent the user from having direct access to
632  * the device, which is useful in some circumstances.
633  *
634  * We also assume that we can include PCI interconnect devices, ie. bridges.
635  * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
636  * then all of the downstream devices will be part of the same IOMMU group as
637  * the bridge.  Thus, if placing the bridge into the user owned IOVA space
638  * breaks anything, it only does so for user owned devices downstream.  Note
639  * that error notification via MSI can be affected for platforms that handle
640  * MSI within the same IOVA space as DMA.
641  */
642 static const char * const vfio_driver_allowed[] = { "pci-stub" };
643 
644 static bool vfio_dev_driver_allowed(struct device *dev,
645 				    struct device_driver *drv)
646 {
647 	if (dev_is_pci(dev)) {
648 		struct pci_dev *pdev = to_pci_dev(dev);
649 
650 		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
651 			return true;
652 	}
653 
654 	return match_string(vfio_driver_allowed,
655 			    ARRAY_SIZE(vfio_driver_allowed),
656 			    drv->name) >= 0;
657 }
658 
659 /*
660  * A vfio group is viable for use by userspace if all devices are in
661  * one of the following states:
662  *  - driver-less
663  *  - bound to a vfio driver
664  *  - bound to an otherwise allowed driver
665  *  - a PCI interconnect device
666  *
667  * We use two methods to determine whether a device is bound to a vfio
668  * driver.  The first is to test whether the device exists in the vfio
669  * group.  The second is to test if the device exists on the group
670  * unbound_list, indicating it's in the middle of transitioning from
671  * a vfio driver to driver-less.
672  */
673 static int vfio_dev_viable(struct device *dev, void *data)
674 {
675 	struct vfio_group *group = data;
676 	struct vfio_device *device;
677 	struct device_driver *drv = READ_ONCE(dev->driver);
678 	struct vfio_unbound_dev *unbound;
679 	int ret = -EINVAL;
680 
681 	mutex_lock(&group->unbound_lock);
682 	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
683 		if (dev == unbound->dev) {
684 			ret = 0;
685 			break;
686 		}
687 	}
688 	mutex_unlock(&group->unbound_lock);
689 
690 	if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
691 		return 0;
692 
693 	device = vfio_group_get_device(group, dev);
694 	if (device) {
695 		vfio_device_put(device);
696 		return 0;
697 	}
698 
699 	return ret;
700 }
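/*
 * Illustrative userspace sketch (not part of this file): the viability rules
 * enforced by vfio_dev_viable() surface to userspace through the
 * VFIO_GROUP_GET_STATUS ioctl on /dev/vfio/$GROUP.  The group number 26 is
 * an arbitrary example; error handling is omitted.
 *
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *	int group_fd = open("/dev/vfio/26", O_RDWR);
 *
 *	ioctl(group_fd, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		// some device in the group is bound to a non-vfio,
 *		// non-allowed driver; unbind it before proceeding
 *		return -1;
 */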
701 
702 /**
703  * Async device support
704  */
705 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
706 {
707 	struct vfio_device *device;
708 
709 	/* Do we already know about it?  We shouldn't */
710 	device = vfio_group_get_device(group, dev);
711 	if (WARN_ON_ONCE(device)) {
712 		vfio_device_put(device);
713 		return 0;
714 	}
715 
716 	/* Nothing to do for idle groups */
717 	if (!atomic_read(&group->container_users))
718 		return 0;
719 
720 	/* TODO Prevent device auto probing */
721 	dev_WARN(dev, "Device added to live group %d!\n",
722 		 iommu_group_id(group->iommu_group));
723 
724 	return 0;
725 }
726 
727 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
728 {
729 	/* We don't care what happens when the group isn't in use */
730 	if (!atomic_read(&group->container_users))
731 		return 0;
732 
733 	return vfio_dev_viable(dev, group);
734 }
735 
736 static int vfio_iommu_group_notifier(struct notifier_block *nb,
737 				     unsigned long action, void *data)
738 {
739 	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
740 	struct device *dev = data;
741 	struct vfio_unbound_dev *unbound;
742 
743 	/*
744 	 * Need to go through a group_lock lookup to get a reference or we
745 	 * risk racing a group being removed.  Ignore spurious notifies.
746 	 */
747 	group = vfio_group_try_get(group);
748 	if (!group)
749 		return NOTIFY_OK;
750 
751 	switch (action) {
752 	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
753 		vfio_group_nb_add_dev(group, dev);
754 		break;
755 	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
756 		/*
757 		 * Nothing to do here.  If the device is in use, then the
758 		 * vfio sub-driver should block the remove callback until
759 		 * it is unused.  If the device is unused or attached to a
760 		 * stub driver, then it should be released and we don't
761 		 * care that it will be going away.
762 		 */
763 		break;
764 	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
765 		dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
766 			iommu_group_id(group->iommu_group));
767 		break;
768 	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
769 		dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
770 			iommu_group_id(group->iommu_group), dev->driver->name);
771 		BUG_ON(vfio_group_nb_verify(group, dev));
772 		break;
773 	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
774 		dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
775 			__func__, iommu_group_id(group->iommu_group),
776 			dev->driver->name);
777 		break;
778 	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
779 		dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
780 			iommu_group_id(group->iommu_group));
781 		/*
782 		 * XXX An unbound device in a live group is ok, but we'd
783 		 * really like to avoid the above BUG_ON by preventing other
784 		 * drivers from binding to it.  Once that occurs, we have to
785 		 * stop the system to maintain isolation.  At a minimum, we'd
786 		 * want a toggle to disable driver auto probe for this device.
787 		 */
788 
789 		mutex_lock(&group->unbound_lock);
790 		list_for_each_entry(unbound,
791 				    &group->unbound_list, unbound_next) {
792 			if (dev == unbound->dev) {
793 				list_del(&unbound->unbound_next);
794 				kfree(unbound);
795 				break;
796 			}
797 		}
798 		mutex_unlock(&group->unbound_lock);
799 		break;
800 	}
801 
802 	/*
803 	 * If we're the last reference to the group, the group will be
804 	 * released, which includes unregistering the iommu group notifier.
805 	 * We hold a read-lock on that notifier list, unregistering needs
806 	 * a write-lock... deadlock.  Release our reference asynchronously
807 	 * to avoid that situation.
808 	 */
809 	vfio_group_schedule_put(group);
810 	return NOTIFY_OK;
811 }
812 
813 /**
814  * VFIO driver API
815  */
816 void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
817 			 const struct vfio_device_ops *ops)
818 {
819 	init_completion(&device->comp);
820 	device->dev = dev;
821 	device->ops = ops;
822 }
823 EXPORT_SYMBOL_GPL(vfio_init_group_dev);
824 
825 void vfio_uninit_group_dev(struct vfio_device *device)
826 {
827 	vfio_release_device_set(device);
828 }
829 EXPORT_SYMBOL_GPL(vfio_uninit_group_dev);
830 
831 int vfio_register_group_dev(struct vfio_device *device)
832 {
833 	struct vfio_device *existing_device;
834 	struct iommu_group *iommu_group;
835 	struct vfio_group *group;
836 
837 	/*
838 	 * If the driver doesn't specify a set then the device is added to a
839 	 * singleton set just for itself.
840 	 */
841 	if (!device->dev_set)
842 		vfio_assign_device_set(device, device);
843 
844 	iommu_group = iommu_group_get(device->dev);
845 	if (!iommu_group)
846 		return -EINVAL;
847 
848 	group = vfio_group_get_from_iommu(iommu_group);
849 	if (!group) {
850 		group = vfio_create_group(iommu_group);
851 		if (IS_ERR(group)) {
852 			iommu_group_put(iommu_group);
853 			return PTR_ERR(group);
854 		}
855 	} else {
856 		/*
857 		 * A found vfio_group already holds a reference to the
858 		 * iommu_group.  A created vfio_group keeps the reference.
859 		 */
860 		iommu_group_put(iommu_group);
861 	}
862 
863 	existing_device = vfio_group_get_device(group, device->dev);
864 	if (existing_device) {
865 		dev_WARN(device->dev, "Device already exists on group %d\n",
866 			 iommu_group_id(iommu_group));
867 		vfio_device_put(existing_device);
868 		vfio_group_put(group);
869 		return -EBUSY;
870 	}
871 
872 	/* Our reference on group is moved to the device */
873 	device->group = group;
874 
875 	/* Refcounting can't start until the driver calls register */
876 	refcount_set(&device->refcount, 1);
877 
878 	mutex_lock(&group->device_lock);
879 	list_add(&device->group_next, &group->device_list);
880 	group->dev_counter++;
881 	mutex_unlock(&group->device_lock);
882 
883 	return 0;
884 }
885 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
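/*
 * Illustrative sketch (not part of this file) of the driver-facing lifecycle
 * around the calls above: vfio_init_group_dev() before registration, and
 * vfio_uninit_group_dev() (which releases the device set) only after
 * vfio_unregister_group_dev() has returned.  "mdev" is a hypothetical driver
 * structure embedding a struct vfio_device named vdev, and my_vfio_dev_ops
 * is a hypothetical ops table.
 *
 *	vfio_init_group_dev(&mdev->vdev, dev, &my_vfio_dev_ops);
 *	ret = vfio_register_group_dev(&mdev->vdev);
 *	if (ret) {
 *		vfio_uninit_group_dev(&mdev->vdev);
 *		return ret;
 *	}
 *	...
 *	// teardown, typically from the bus remove callback:
 *	vfio_unregister_group_dev(&mdev->vdev);	// waits for open fds to close
 *	vfio_uninit_group_dev(&mdev->vdev);
 */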
886 
887 /**
888  * Get a reference to the vfio_device for a device.  Even if the
889  * caller thinks they own the device, they could be racing with a
890  * release call path, so we can't trust drvdata for the shortcut.
891  * Go the long way around, from the iommu_group to the vfio_group
892  * to the vfio_device.
893  */
894 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
895 {
896 	struct vfio_group *group;
897 	struct vfio_device *device;
898 
899 	group = vfio_group_get_from_dev(dev);
900 	if (!group)
901 		return NULL;
902 
903 	device = vfio_group_get_device(group, dev);
904 	vfio_group_put(group);
905 
906 	return device;
907 }
908 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
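/*
 * Illustrative sketch (not part of this file): an external module holding a
 * struct device can briefly pin the corresponding vfio_device as described
 * above.  The reference must be dropped with vfio_device_put(), otherwise
 * vfio_unregister_group_dev() will block waiting on the device completion.
 *
 *	struct vfio_device *vdev = vfio_device_get_from_dev(dev);
 *
 *	if (!vdev)
 *		return -ENODEV;	// not (or no longer) a registered vfio device
 *	...use vdev...
 *	vfio_device_put(vdev);
 */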
909 
910 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
911 						     char *buf)
912 {
913 	struct vfio_device *it, *device = ERR_PTR(-ENODEV);
914 
915 	mutex_lock(&group->device_lock);
916 	list_for_each_entry(it, &group->device_list, group_next) {
917 		int ret;
918 
919 		if (it->ops->match) {
920 			ret = it->ops->match(it, buf);
921 			if (ret < 0) {
922 				device = ERR_PTR(ret);
923 				break;
924 			}
925 		} else {
926 			ret = !strcmp(dev_name(it->dev), buf);
927 		}
928 
929 		if (ret && vfio_device_try_get(it)) {
930 			device = it;
931 			break;
932 		}
933 	}
934 	mutex_unlock(&group->device_lock);
935 
936 	return device;
937 }
938 
939 /*
940  * Decrement the device reference count and wait for the device to be
941  * removed.  Open file descriptors for the device... */
942 void vfio_unregister_group_dev(struct vfio_device *device)
943 {
944 	struct vfio_group *group = device->group;
945 	struct vfio_unbound_dev *unbound;
946 	unsigned int i = 0;
947 	bool interrupted = false;
948 	long rc;
949 
950 	/*
951 	 * When the device is removed from the group, the group suddenly
952 	 * becomes non-viable; the device has a driver (until the unbind
953 	 * completes), but it's not present in the group.  This is bad news
954 	 * for any external users that need to re-acquire a group reference
955 	 * in order to match and release their existing reference.  To
956 	 * solve this, we track such devices on the unbound_list to bridge
957 	 * the gap until they're fully unbound.
958 	 */
959 	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
960 	if (unbound) {
961 		unbound->dev = device->dev;
962 		mutex_lock(&group->unbound_lock);
963 		list_add(&unbound->unbound_next, &group->unbound_list);
964 		mutex_unlock(&group->unbound_lock);
965 	}
966 	WARN_ON(!unbound);
967 
968 	vfio_device_put(device);
969 	rc = try_wait_for_completion(&device->comp);
970 	while (rc <= 0) {
971 		if (device->ops->request)
972 			device->ops->request(device, i++);
973 
974 		if (interrupted) {
975 			rc = wait_for_completion_timeout(&device->comp,
976 							 HZ * 10);
977 		} else {
978 			rc = wait_for_completion_interruptible_timeout(
979 				&device->comp, HZ * 10);
980 			if (rc < 0) {
981 				interrupted = true;
982 				dev_warn(device->dev,
983 					 "Device is currently in use, task"
984 					 " \"%s\" (%d) "
985 					 "blocked until device is released",
986 					 current->comm, task_pid_nr(current));
987 			}
988 		}
989 	}
990 
991 	mutex_lock(&group->device_lock);
992 	list_del(&device->group_next);
993 	group->dev_counter--;
994 	mutex_unlock(&group->device_lock);
995 
996 	/*
997 	 * In order to support multiple devices per group, devices can be
998 	 * plucked from the group while other devices in the group are still
999 	 * in use.  The container persists with this group and those remaining
1000 	 * devices still attached.  If the user creates an isolation violation
1001 	 * by binding this device to another driver while the group is still in
1002 	 * use, that's their fault.  However, in the case of removing the last,
1003 	 * or potentially the only, device in the group there can be no other
1004 	 * in-use devices in the group.  The user has done their due diligence
1005 	 * and we should lay no claims to those devices.  In order to do that,
1006 	 * we need to make sure the group is detached from the container.
1007 	 * Without this stall, we're potentially racing with a user process
1008 	 * that may attempt to immediately bind this device to another driver.
1009 	 */
1010 	if (list_empty(&group->device_list))
1011 		wait_event(group->container_q, !group->container);
1012 
1013 	/* Matches the get in vfio_register_group_dev() */
1014 	vfio_group_put(group);
1015 }
1016 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
1017 
1018 /**
1019  * VFIO base fd, /dev/vfio/vfio
1020  */
1021 static long vfio_ioctl_check_extension(struct vfio_container *container,
1022 				       unsigned long arg)
1023 {
1024 	struct vfio_iommu_driver *driver;
1025 	long ret = 0;
1026 
1027 	down_read(&container->group_lock);
1028 
1029 	driver = container->iommu_driver;
1030 
1031 	switch (arg) {
1032 		/* No base extensions yet */
1033 	default:
1034 		/*
1035 		 * If no driver is set, poll all registered drivers for
1036 		 * extensions and return the first positive result.  If
1037 		 * a driver is already set, further queries will be passed
1038 		 * only to that driver.
1039 		 */
1040 		if (!driver) {
1041 			mutex_lock(&vfio.iommu_drivers_lock);
1042 			list_for_each_entry(driver, &vfio.iommu_drivers_list,
1043 					    vfio_next) {
1044 
1045 #ifdef CONFIG_VFIO_NOIOMMU
1046 				if (!list_empty(&container->group_list) &&
1047 				    (container->noiommu !=
1048 				     (driver->ops == &vfio_noiommu_ops)))
1049 					continue;
1050 #endif
1051 
1052 				if (!try_module_get(driver->ops->owner))
1053 					continue;
1054 
1055 				ret = driver->ops->ioctl(NULL,
1056 							 VFIO_CHECK_EXTENSION,
1057 							 arg);
1058 				module_put(driver->ops->owner);
1059 				if (ret > 0)
1060 					break;
1061 			}
1062 			mutex_unlock(&vfio.iommu_drivers_lock);
1063 		} else
1064 			ret = driver->ops->ioctl(container->iommu_data,
1065 						 VFIO_CHECK_EXTENSION, arg);
1066 	}
1067 
1068 	up_read(&container->group_lock);
1069 
1070 	return ret;
1071 }
1072 
1073 /* hold write lock on container->group_lock */
1074 static int __vfio_container_attach_groups(struct vfio_container *container,
1075 					  struct vfio_iommu_driver *driver,
1076 					  void *data)
1077 {
1078 	struct vfio_group *group;
1079 	int ret = -ENODEV;
1080 
1081 	list_for_each_entry(group, &container->group_list, container_next) {
1082 		ret = driver->ops->attach_group(data, group->iommu_group);
1083 		if (ret)
1084 			goto unwind;
1085 	}
1086 
1087 	return ret;
1088 
1089 unwind:
1090 	list_for_each_entry_continue_reverse(group, &container->group_list,
1091 					     container_next) {
1092 		driver->ops->detach_group(data, group->iommu_group);
1093 	}
1094 
1095 	return ret;
1096 }
1097 
1098 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1099 				 unsigned long arg)
1100 {
1101 	struct vfio_iommu_driver *driver;
1102 	long ret = -ENODEV;
1103 
1104 	down_write(&container->group_lock);
1105 
1106 	/*
1107 	 * The container is designed to be an unprivileged interface while
1108 	 * the group can be assigned to specific users.  Therefore, only by
1109 	 * adding a group to a container does the user get the privilege of
1110 	 * enabling the iommu, which may allocate finite resources.  There
1111 	 * is no unset_iommu, but by removing all the groups from a container,
1112 	 * the container is deprivileged and returns to an unset state.
1113 	 */
1114 	if (list_empty(&container->group_list) || container->iommu_driver) {
1115 		up_write(&container->group_lock);
1116 		return -EINVAL;
1117 	}
1118 
1119 	mutex_lock(&vfio.iommu_drivers_lock);
1120 	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1121 		void *data;
1122 
1123 #ifdef CONFIG_VFIO_NOIOMMU
1124 		/*
1125 		 * Only noiommu containers can use vfio-noiommu and noiommu
1126 		 * containers can only use vfio-noiommu.
1127 		 */
1128 		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1129 			continue;
1130 #endif
1131 
1132 		if (!try_module_get(driver->ops->owner))
1133 			continue;
1134 
1135 		/*
1136 		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1137 		 * so test which iommu driver reported support for this
1138 		 * extension and call open on them.  We also pass them the
1139 		 * magic, allowing a single driver to support multiple
1140 		 * interfaces if they'd like.
1141 		 */
1142 		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1143 			module_put(driver->ops->owner);
1144 			continue;
1145 		}
1146 
1147 		data = driver->ops->open(arg);
1148 		if (IS_ERR(data)) {
1149 			ret = PTR_ERR(data);
1150 			module_put(driver->ops->owner);
1151 			continue;
1152 		}
1153 
1154 		ret = __vfio_container_attach_groups(container, driver, data);
1155 		if (ret) {
1156 			driver->ops->release(data);
1157 			module_put(driver->ops->owner);
1158 			continue;
1159 		}
1160 
1161 		container->iommu_driver = driver;
1162 		container->iommu_data = data;
1163 		break;
1164 	}
1165 
1166 	mutex_unlock(&vfio.iommu_drivers_lock);
1167 	up_write(&container->group_lock);
1168 
1169 	return ret;
1170 }
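/*
 * Illustrative userspace sketch (not part of this file) of the privilege
 * model described above: a container starts out unset, can only gain an
 * IOMMU backend once a group is attached, and VFIO_SET_IOMMU picks whichever
 * registered driver claimed the requested extension.  Error handling is
 * omitted; the group number 26 is an arbitrary example.
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;				// unknown API
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;				// no type1 backend
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	// the container's DMA mapping ioctls are now usable
 */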
1171 
1172 static long vfio_fops_unl_ioctl(struct file *filep,
1173 				unsigned int cmd, unsigned long arg)
1174 {
1175 	struct vfio_container *container = filep->private_data;
1176 	struct vfio_iommu_driver *driver;
1177 	void *data;
1178 	long ret = -EINVAL;
1179 
1180 	if (!container)
1181 		return ret;
1182 
1183 	switch (cmd) {
1184 	case VFIO_GET_API_VERSION:
1185 		ret = VFIO_API_VERSION;
1186 		break;
1187 	case VFIO_CHECK_EXTENSION:
1188 		ret = vfio_ioctl_check_extension(container, arg);
1189 		break;
1190 	case VFIO_SET_IOMMU:
1191 		ret = vfio_ioctl_set_iommu(container, arg);
1192 		break;
1193 	default:
1194 		driver = container->iommu_driver;
1195 		data = container->iommu_data;
1196 
1197 		if (driver) /* passthrough all unrecognized ioctls */
1198 			ret = driver->ops->ioctl(data, cmd, arg);
1199 	}
1200 
1201 	return ret;
1202 }
1203 
1204 static int vfio_fops_open(struct inode *inode, struct file *filep)
1205 {
1206 	struct vfio_container *container;
1207 
1208 	container = kzalloc(sizeof(*container), GFP_KERNEL);
1209 	if (!container)
1210 		return -ENOMEM;
1211 
1212 	INIT_LIST_HEAD(&container->group_list);
1213 	init_rwsem(&container->group_lock);
1214 	kref_init(&container->kref);
1215 
1216 	filep->private_data = container;
1217 
1218 	return 0;
1219 }
1220 
1221 static int vfio_fops_release(struct inode *inode, struct file *filep)
1222 {
1223 	struct vfio_container *container = filep->private_data;
1224 	struct vfio_iommu_driver *driver = container->iommu_driver;
1225 
1226 	if (driver && driver->ops->notify)
1227 		driver->ops->notify(container->iommu_data,
1228 				    VFIO_IOMMU_CONTAINER_CLOSE);
1229 
1230 	filep->private_data = NULL;
1231 
1232 	vfio_container_put(container);
1233 
1234 	return 0;
1235 }
1236 
1237 /*
1238  * Once an iommu driver is set, we optionally pass read/write/mmap
1239  * on to the driver, allowing management interfaces beyond ioctl.
1240  */
1241 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1242 			      size_t count, loff_t *ppos)
1243 {
1244 	struct vfio_container *container = filep->private_data;
1245 	struct vfio_iommu_driver *driver;
1246 	ssize_t ret = -EINVAL;
1247 
1248 	driver = container->iommu_driver;
1249 	if (likely(driver && driver->ops->read))
1250 		ret = driver->ops->read(container->iommu_data,
1251 					buf, count, ppos);
1252 
1253 	return ret;
1254 }
1255 
1256 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1257 			       size_t count, loff_t *ppos)
1258 {
1259 	struct vfio_container *container = filep->private_data;
1260 	struct vfio_iommu_driver *driver;
1261 	ssize_t ret = -EINVAL;
1262 
1263 	driver = container->iommu_driver;
1264 	if (likely(driver && driver->ops->write))
1265 		ret = driver->ops->write(container->iommu_data,
1266 					 buf, count, ppos);
1267 
1268 	return ret;
1269 }
1270 
1271 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1272 {
1273 	struct vfio_container *container = filep->private_data;
1274 	struct vfio_iommu_driver *driver;
1275 	int ret = -EINVAL;
1276 
1277 	driver = container->iommu_driver;
1278 	if (likely(driver && driver->ops->mmap))
1279 		ret = driver->ops->mmap(container->iommu_data, vma);
1280 
1281 	return ret;
1282 }
1283 
1284 static const struct file_operations vfio_fops = {
1285 	.owner		= THIS_MODULE,
1286 	.open		= vfio_fops_open,
1287 	.release	= vfio_fops_release,
1288 	.read		= vfio_fops_read,
1289 	.write		= vfio_fops_write,
1290 	.unlocked_ioctl	= vfio_fops_unl_ioctl,
1291 	.compat_ioctl	= compat_ptr_ioctl,
1292 	.mmap		= vfio_fops_mmap,
1293 };
1294 
1295 /**
1296  * VFIO Group fd, /dev/vfio/$GROUP
1297  */
1298 static void __vfio_group_unset_container(struct vfio_group *group)
1299 {
1300 	struct vfio_container *container = group->container;
1301 	struct vfio_iommu_driver *driver;
1302 
1303 	down_write(&container->group_lock);
1304 
1305 	driver = container->iommu_driver;
1306 	if (driver)
1307 		driver->ops->detach_group(container->iommu_data,
1308 					  group->iommu_group);
1309 
1310 	group->container = NULL;
1311 	wake_up(&group->container_q);
1312 	list_del(&group->container_next);
1313 
1314 	/* Detaching the last group deprivileges a container, remove iommu */
1315 	if (driver && list_empty(&container->group_list)) {
1316 		driver->ops->release(container->iommu_data);
1317 		module_put(driver->ops->owner);
1318 		container->iommu_driver = NULL;
1319 		container->iommu_data = NULL;
1320 	}
1321 
1322 	up_write(&container->group_lock);
1323 
1324 	vfio_container_put(container);
1325 }
1326 
1327 /*
1328  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1329  * if there was no container to unset.  Since the ioctl is called on
1330  * the group, we know that still exists, therefore the only valid
1331  * transition here is 1->0.
1332  */
1333 static int vfio_group_unset_container(struct vfio_group *group)
1334 {
1335 	int users = atomic_cmpxchg(&group->container_users, 1, 0);
1336 
1337 	if (!users)
1338 		return -EINVAL;
1339 	if (users != 1)
1340 		return -EBUSY;
1341 
1342 	__vfio_group_unset_container(group);
1343 
1344 	return 0;
1345 }
1346 
1347 /*
1348  * When removing container users, anything that removes the last user
1349  * implicitly removes the group from the container.  That is, if the
1350  * group file descriptor is closed, as well as any device file descriptors,
1351  * the group is free.
1352  */
1353 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1354 {
1355 	if (0 == atomic_dec_if_positive(&group->container_users))
1356 		__vfio_group_unset_container(group);
1357 }
1358 
1359 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1360 {
1361 	struct fd f;
1362 	struct vfio_container *container;
1363 	struct vfio_iommu_driver *driver;
1364 	int ret = 0;
1365 
1366 	if (atomic_read(&group->container_users))
1367 		return -EINVAL;
1368 
1369 	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1370 		return -EPERM;
1371 
1372 	f = fdget(container_fd);
1373 	if (!f.file)
1374 		return -EBADF;
1375 
1376 	/* Sanity check, is this really our fd? */
1377 	if (f.file->f_op != &vfio_fops) {
1378 		fdput(f);
1379 		return -EINVAL;
1380 	}
1381 
1382 	container = f.file->private_data;
1383 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
1384 
1385 	down_write(&container->group_lock);
1386 
1387 	/* Real groups and fake groups cannot mix */
1388 	if (!list_empty(&container->group_list) &&
1389 	    container->noiommu != group->noiommu) {
1390 		ret = -EPERM;
1391 		goto unlock_out;
1392 	}
1393 
1394 	driver = container->iommu_driver;
1395 	if (driver) {
1396 		ret = driver->ops->attach_group(container->iommu_data,
1397 						group->iommu_group);
1398 		if (ret)
1399 			goto unlock_out;
1400 	}
1401 
1402 	group->container = container;
1403 	container->noiommu = group->noiommu;
1404 	list_add(&group->container_next, &container->group_list);
1405 
1406 	/* Get a reference on the container and mark a user within the group */
1407 	vfio_container_get(container);
1408 	atomic_inc(&group->container_users);
1409 
1410 unlock_out:
1411 	up_write(&container->group_lock);
1412 	fdput(f);
1413 	return ret;
1414 }
1415 
1416 static bool vfio_group_viable(struct vfio_group *group)
1417 {
1418 	return (iommu_group_for_each_dev(group->iommu_group,
1419 					 group, vfio_dev_viable) == 0);
1420 }
1421 
1422 static int vfio_group_add_container_user(struct vfio_group *group)
1423 {
1424 	if (!atomic_inc_not_zero(&group->container_users))
1425 		return -EINVAL;
1426 
1427 	if (group->noiommu) {
1428 		atomic_dec(&group->container_users);
1429 		return -EPERM;
1430 	}
1431 	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1432 		atomic_dec(&group->container_users);
1433 		return -EINVAL;
1434 	}
1435 
1436 	return 0;
1437 }
1438 
1439 static const struct file_operations vfio_device_fops;
1440 
1441 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1442 {
1443 	struct vfio_device *device;
1444 	struct file *filep;
1445 	int fdno;
1446 	int ret = 0;
1447 
1448 	if (0 == atomic_read(&group->container_users) ||
1449 	    !group->container->iommu_driver || !vfio_group_viable(group))
1450 		return -EINVAL;
1451 
1452 	if (group->noiommu && !capable(CAP_SYS_RAWIO))
1453 		return -EPERM;
1454 
1455 	device = vfio_device_get_from_name(group, buf);
1456 	if (IS_ERR(device))
1457 		return PTR_ERR(device);
1458 
1459 	if (!try_module_get(device->dev->driver->owner)) {
1460 		ret = -ENODEV;
1461 		goto err_device_put;
1462 	}
1463 
1464 	mutex_lock(&device->dev_set->lock);
1465 	device->open_count++;
1466 	if (device->open_count == 1 && device->ops->open_device) {
1467 		ret = device->ops->open_device(device);
1468 		if (ret)
1469 			goto err_undo_count;
1470 	}
1471 	mutex_unlock(&device->dev_set->lock);
1472 
1473 	/*
1474 	 * We can't use anon_inode_getfd() because we need to modify
1475 	 * the f_mode flags directly to allow more than just ioctls
1476 	 */
1477 	fdno = ret = get_unused_fd_flags(O_CLOEXEC);
1478 	if (ret < 0)
1479 		goto err_close_device;
1480 
1481 	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1482 				   device, O_RDWR);
1483 	if (IS_ERR(filep)) {
1484 		ret = PTR_ERR(filep);
1485 		goto err_fd;
1486 	}
1487 
1488 	/*
1489 	 * TODO: add an anon_inode interface to do this.
1490 	 * Appears to be missing by lack of need rather than
1491 	 * explicitly prevented.  Now there's need.
1492 	 */
1493 	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1494 
1495 	atomic_inc(&group->container_users);
1496 
1497 	fd_install(fdno, filep);
1498 
1499 	if (group->noiommu)
1500 		dev_warn(device->dev, "vfio-noiommu device opened by user "
1501 			 "(%s:%d)\n", current->comm, task_pid_nr(current));
1502 	return fdno;
1503 
1504 err_fd:
1505 	put_unused_fd(fdno);
1506 err_close_device:
1507 	mutex_lock(&device->dev_set->lock);
1508 	if (device->open_count == 1 && device->ops->close_device)
1509 		device->ops->close_device(device);
1510 err_undo_count:
1511 	device->open_count--;
1512 	mutex_unlock(&device->dev_set->lock);
1513 	module_put(device->dev->driver->owner);
1514 err_device_put:
1515 	vfio_device_put(device);
1516 	return ret;
1517 }
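/*
 * Illustrative userspace sketch (not part of this file): once the group is
 * attached to a container with an IOMMU driver set, a device fd is obtained
 * by name through the group fd.  The PCI address below is an arbitrary
 * example and error handling is omitted.
 *
 *	struct vfio_device_info info = { .argsz = sizeof(info) };
 *	int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 *
 *	if (device < 0)
 *		return -1;
 *	ioctl(device, VFIO_DEVICE_GET_INFO, &info);	// regions, irqs, flags
 */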
1518 
1519 static long vfio_group_fops_unl_ioctl(struct file *filep,
1520 				      unsigned int cmd, unsigned long arg)
1521 {
1522 	struct vfio_group *group = filep->private_data;
1523 	long ret = -ENOTTY;
1524 
1525 	switch (cmd) {
1526 	case VFIO_GROUP_GET_STATUS:
1527 	{
1528 		struct vfio_group_status status;
1529 		unsigned long minsz;
1530 
1531 		minsz = offsetofend(struct vfio_group_status, flags);
1532 
1533 		if (copy_from_user(&status, (void __user *)arg, minsz))
1534 			return -EFAULT;
1535 
1536 		if (status.argsz < minsz)
1537 			return -EINVAL;
1538 
1539 		status.flags = 0;
1540 
1541 		if (vfio_group_viable(group))
1542 			status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1543 
1544 		if (group->container)
1545 			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1546 
1547 		if (copy_to_user((void __user *)arg, &status, minsz))
1548 			return -EFAULT;
1549 
1550 		ret = 0;
1551 		break;
1552 	}
1553 	case VFIO_GROUP_SET_CONTAINER:
1554 	{
1555 		int fd;
1556 
1557 		if (get_user(fd, (int __user *)arg))
1558 			return -EFAULT;
1559 
1560 		if (fd < 0)
1561 			return -EINVAL;
1562 
1563 		ret = vfio_group_set_container(group, fd);
1564 		break;
1565 	}
1566 	case VFIO_GROUP_UNSET_CONTAINER:
1567 		ret = vfio_group_unset_container(group);
1568 		break;
1569 	case VFIO_GROUP_GET_DEVICE_FD:
1570 	{
1571 		char *buf;
1572 
1573 		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1574 		if (IS_ERR(buf))
1575 			return PTR_ERR(buf);
1576 
1577 		ret = vfio_group_get_device_fd(group, buf);
1578 		kfree(buf);
1579 		break;
1580 	}
1581 	}
1582 
1583 	return ret;
1584 }
1585 
1586 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1587 {
1588 	struct vfio_group *group;
1589 	int opened;
1590 
1591 	group = vfio_group_get_from_minor(iminor(inode));
1592 	if (!group)
1593 		return -ENODEV;
1594 
1595 	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1596 		vfio_group_put(group);
1597 		return -EPERM;
1598 	}
1599 
1600 	/* Do we need multiple instances of the group open?  Seems not. */
1601 	opened = atomic_cmpxchg(&group->opened, 0, 1);
1602 	if (opened) {
1603 		vfio_group_put(group);
1604 		return -EBUSY;
1605 	}
1606 
1607 	/* Is something still in use from a previous open? */
1608 	if (group->container) {
1609 		atomic_dec(&group->opened);
1610 		vfio_group_put(group);
1611 		return -EBUSY;
1612 	}
1613 
1614 	/* Warn if previous user didn't cleanup and re-init to drop them */
1615 	if (WARN_ON(group->notifier.head))
1616 		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1617 
1618 	filep->private_data = group;
1619 
1620 	return 0;
1621 }
1622 
1623 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1624 {
1625 	struct vfio_group *group = filep->private_data;
1626 
1627 	filep->private_data = NULL;
1628 
1629 	vfio_group_try_dissolve_container(group);
1630 
1631 	atomic_dec(&group->opened);
1632 
1633 	vfio_group_put(group);
1634 
1635 	return 0;
1636 }
1637 
1638 static const struct file_operations vfio_group_fops = {
1639 	.owner		= THIS_MODULE,
1640 	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
1641 	.compat_ioctl	= compat_ptr_ioctl,
1642 	.open		= vfio_group_fops_open,
1643 	.release	= vfio_group_fops_release,
1644 };
1645 
1646 /**
1647  * VFIO Device fd
1648  */
1649 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1650 {
1651 	struct vfio_device *device = filep->private_data;
1652 
1653 	mutex_lock(&device->dev_set->lock);
1654 	if (!--device->open_count && device->ops->close_device)
1655 		device->ops->close_device(device);
1656 	mutex_unlock(&device->dev_set->lock);
1657 
1658 	module_put(device->dev->driver->owner);
1659 
1660 	vfio_group_try_dissolve_container(device->group);
1661 
1662 	vfio_device_put(device);
1663 
1664 	return 0;
1665 }
1666 
1667 static long vfio_device_fops_unl_ioctl(struct file *filep,
1668 				       unsigned int cmd, unsigned long arg)
1669 {
1670 	struct vfio_device *device = filep->private_data;
1671 
1672 	if (unlikely(!device->ops->ioctl))
1673 		return -EINVAL;
1674 
1675 	return device->ops->ioctl(device, cmd, arg);
1676 }
1677 
1678 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1679 				     size_t count, loff_t *ppos)
1680 {
1681 	struct vfio_device *device = filep->private_data;
1682 
1683 	if (unlikely(!device->ops->read))
1684 		return -EINVAL;
1685 
1686 	return device->ops->read(device, buf, count, ppos);
1687 }
1688 
1689 static ssize_t vfio_device_fops_write(struct file *filep,
1690 				      const char __user *buf,
1691 				      size_t count, loff_t *ppos)
1692 {
1693 	struct vfio_device *device = filep->private_data;
1694 
1695 	if (unlikely(!device->ops->write))
1696 		return -EINVAL;
1697 
1698 	return device->ops->write(device, buf, count, ppos);
1699 }
1700 
1701 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1702 {
1703 	struct vfio_device *device = filep->private_data;
1704 
1705 	if (unlikely(!device->ops->mmap))
1706 		return -EINVAL;
1707 
1708 	return device->ops->mmap(device, vma);
1709 }
1710 
1711 static const struct file_operations vfio_device_fops = {
1712 	.owner		= THIS_MODULE,
1713 	.release	= vfio_device_fops_release,
1714 	.read		= vfio_device_fops_read,
1715 	.write		= vfio_device_fops_write,
1716 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1717 	.compat_ioctl	= compat_ptr_ioctl,
1718 	.mmap		= vfio_device_fops_mmap,
1719 };
1720 
1721 /**
1722  * External user API, exported by symbols to be linked dynamically.
1723  *
1724  * The protocol includes:
1725  *  1. do normal VFIO init operation:
1726  *	- opening a new container;
1727  *	- attaching group(s) to it;
1728  *	- setting an IOMMU driver for a container.
1729  * When IOMMU is set for a container, all groups in it are
1730  * considered ready to use by an external user.
1731  *
1732  * 2. User space passes a group fd to an external user.
1733  * The external user calls vfio_group_get_external_user()
1734  * to verify that:
1735  *	- the group is initialized;
1736  *	- IOMMU is set for it.
1737  * If both checks passed, vfio_group_get_external_user()
1738  * increments the container user counter to prevent
1739  * the VFIO group from disposal before KVM exits.
1740  *
1741  * 3. The external user calls vfio_external_user_iommu_id()
1742  * to know an IOMMU ID.
1743  *
1744  * 4. When the external KVM finishes, it calls
1745  * vfio_group_put_external_user() to release the VFIO group.
1746  * This call decrements the container user counter.
1747  */
1748 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1749 {
1750 	struct vfio_group *group = filep->private_data;
1751 	int ret;
1752 
1753 	if (filep->f_op != &vfio_group_fops)
1754 		return ERR_PTR(-EINVAL);
1755 
1756 	ret = vfio_group_add_container_user(group);
1757 	if (ret)
1758 		return ERR_PTR(ret);
1759 
1760 	vfio_group_get(group);
1761 
1762 	return group;
1763 }
1764 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
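
/*
 * Example (illustrative sketch, not part of this file): how an external
 * user such as KVM might take and later drop a group reference, given a
 * group file descriptor passed in from user space.  The function names
 * example_get_group_from_fd()/example_put_group() are hypothetical.
 */
#if 0
static struct vfio_group *example_get_group_from_fd(int fd)
{
	struct fd f = fdget(fd);
	struct vfio_group *group;

	if (!f.file)
		return ERR_PTR(-EBADF);

	/* Fails unless the group is attached to an IOMMU-backed container */
	group = vfio_group_get_external_user(f.file);
	fdput(f);

	if (!IS_ERR(group))
		pr_info("holding IOMMU group %d\n",
			vfio_external_user_iommu_id(group));

	return group;
}

static void example_put_group(struct vfio_group *group)
{
	/* Drops the container user count taken above */
	vfio_group_put_external_user(group);
}
#endif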
1765 
1766 /**
1767  * External user API, exported as symbols to be linked dynamically.
1768  * The external user passes in a device pointer
1769  * to verify that:
1770  *	- A VFIO group is associated with the device;
1771  *	- IOMMU is set for the group.
1772  * If both checks passed, vfio_group_get_external_user_from_dev()
1773  * increments the container user counter to prevent the VFIO group
1774  * from disposal before the external user exits, and returns a pointer
1775  * to the VFIO group.
1776  *
1777  * When the external user finishes using the VFIO group, it calls
1778  * vfio_group_put_external_user() to release the VFIO group and
1779  * decrement the container user counter.
1780  *
1781  * @dev [in]	: device
1782  * Return error PTR or pointer to VFIO group.
1783  */
1784 
1785 struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
1786 {
1787 	struct vfio_group *group;
1788 	int ret;
1789 
1790 	group = vfio_group_get_from_dev(dev);
1791 	if (!group)
1792 		return ERR_PTR(-ENODEV);
1793 
1794 	ret = vfio_group_add_container_user(group);
1795 	if (ret) {
1796 		vfio_group_put(group);
1797 		return ERR_PTR(ret);
1798 	}
1799 
1800 	return group;
1801 }
1802 EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
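
/*
 * Example (illustrative sketch, not part of this file): a kernel caller
 * that only has a struct device can take the same external-user reference
 * directly from it.  example_with_group() is a hypothetical helper.
 */
#if 0
static int example_with_group(struct device *dev)
{
	struct vfio_group *group = vfio_group_get_external_user_from_dev(dev);

	if (IS_ERR(group))
		return PTR_ERR(group);

	/* ... pin pages, do DMA r/w, etc., while the reference is held ... */

	vfio_group_put_external_user(group);
	return 0;
}
#endif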
1803 
1804 void vfio_group_put_external_user(struct vfio_group *group)
1805 {
1806 	vfio_group_try_dissolve_container(group);
1807 	vfio_group_put(group);
1808 }
1809 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1810 
1811 bool vfio_external_group_match_file(struct vfio_group *test_group,
1812 				    struct file *filep)
1813 {
1814 	struct vfio_group *group = filep->private_data;
1815 
1816 	return (filep->f_op == &vfio_group_fops) && (group == test_group);
1817 }
1818 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1819 
1820 int vfio_external_user_iommu_id(struct vfio_group *group)
1821 {
1822 	return iommu_group_id(group->iommu_group);
1823 }
1824 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1825 
1826 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1827 {
1828 	return vfio_ioctl_check_extension(group->container, arg);
1829 }
1830 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1831 
1832 /**
1833  * Sub-module support
1834  */
1835 /*
1836  * Helper for managing a buffer of info chain capabilities: allocates or
1837  * reallocates the buffer with additional @size, filling in @id and @version
1838  * of the capability.  A pointer to the new capability is returned.
1839  *
1840  * NB. The chain is based at the head of the buffer, so new entries are
1841  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1842  * next offsets prior to copying to the user buffer.
1843  */
1844 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1845 					       size_t size, u16 id, u16 version)
1846 {
1847 	void *buf;
1848 	struct vfio_info_cap_header *header, *tmp;
1849 
1850 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1851 	if (!buf) {
1852 		kfree(caps->buf);
1853 		caps->buf = NULL;
1854 		caps->size = 0;
1855 		return ERR_PTR(-ENOMEM);
1856 	}
1857 
1858 	caps->buf = buf;
1859 	header = buf + caps->size;
1860 
1861 	/* Eventually copied to user buffer, zero */
1862 	memset(header, 0, size);
1863 
1864 	header->id = id;
1865 	header->version = version;
1866 
1867 	/* Add to the end of the capability chain */
1868 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1869 		; /* nothing */
1870 
1871 	tmp->next = caps->size;
1872 	caps->size += size;
1873 
1874 	return header;
1875 }
1876 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1877 
1878 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1879 {
1880 	struct vfio_info_cap_header *tmp;
1881 	void *buf = (void *)caps->buf;
1882 
1883 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1884 		tmp->next += offset;
1885 }
1886 EXPORT_SYMBOL(vfio_info_cap_shift);
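
/*
 * Example (illustrative sketch, not part of this file): the usual pattern
 * for a bus driver building an info capability chain in a *_GET_INFO ioctl
 * and copying it to user space behind the fixed-size info struct.  The
 * capability layout (struct example_cap, ID 0xffff) and example_fill_info()
 * are hypothetical; the vfio_info_cap_*() calls are the helpers above.
 */
#if 0
struct example_cap {
	struct vfio_info_cap_header header;
	__u32 flags;
};

static int example_fill_info(struct vfio_region_info *info,
			     void __user *arg, unsigned long minsz)
{
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	struct example_cap cap = {
		.header.id = 0xffff,		/* hypothetical capability ID */
		.header.version = 1,
	};
	int ret;

	ret = vfio_info_add_capability(&caps, &cap.header, sizeof(cap));
	if (ret)
		return ret;

	if (caps.size) {
		info->flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info->argsz < sizeof(*info) + caps.size) {
			/* Report the size needed; user space retries */
			info->argsz = sizeof(*info) + caps.size;
			info->cap_offset = 0;
		} else {
			/* Chain offsets are buffer relative; rebase past *info */
			vfio_info_cap_shift(&caps, sizeof(*info));
			if (copy_to_user(arg + sizeof(*info),
					 caps.buf, caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info->cap_offset = sizeof(*info);
		}
		kfree(caps.buf);
	}

	return copy_to_user(arg, info, minsz) ? -EFAULT : 0;
}
#endif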
1887 
1888 int vfio_info_add_capability(struct vfio_info_cap *caps,
1889 			     struct vfio_info_cap_header *cap, size_t size)
1890 {
1891 	struct vfio_info_cap_header *header;
1892 
1893 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1894 	if (IS_ERR(header))
1895 		return PTR_ERR(header);
1896 
1897 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1898 
1899 	return 0;
1900 }
1901 EXPORT_SYMBOL(vfio_info_add_capability);
1902 
1903 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1904 				       int max_irq_type, size_t *data_size)
1905 {
1906 	unsigned long minsz;
1907 	size_t size;
1908 
1909 	minsz = offsetofend(struct vfio_irq_set, count);
1910 
1911 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1912 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1913 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1914 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1915 		return -EINVAL;
1916 
1917 	if (data_size)
1918 		*data_size = 0;
1919 
1920 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1921 		return -EINVAL;
1922 
1923 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1924 	case VFIO_IRQ_SET_DATA_NONE:
1925 		size = 0;
1926 		break;
1927 	case VFIO_IRQ_SET_DATA_BOOL:
1928 		size = sizeof(uint8_t);
1929 		break;
1930 	case VFIO_IRQ_SET_DATA_EVENTFD:
1931 		size = sizeof(int32_t);
1932 		break;
1933 	default:
1934 		return -EINVAL;
1935 	}
1936 
1937 	if (size) {
1938 		if (hdr->argsz - minsz < hdr->count * size)
1939 			return -EINVAL;
1940 
1941 		if (!data_size)
1942 			return -EINVAL;
1943 
1944 		*data_size = hdr->count * size;
1945 	}
1946 
1947 	return 0;
1948 }
1949 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
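
/*
 * Example (illustrative sketch, not part of this file): validating a
 * VFIO_DEVICE_SET_IRQS request in a driver's ioctl path.  The limits
 * (EXAMPLE_NUM_IRQS, EXAMPLE_NUM_IRQ_TYPES) and example_program_irqs()
 * are hypothetical placeholders for driver-specific pieces.
 */
#if 0
#define EXAMPLE_NUM_IRQS	4	/* IRQs available at hdr.index */
#define EXAMPLE_NUM_IRQ_TYPES	1	/* number of supported IRQ indexes */

static int example_ioctl_set_irqs(struct vfio_device *vdev, unsigned long arg)
{
	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
	struct vfio_irq_set hdr;
	size_t data_size = 0;
	void *data = NULL;
	int ret;

	if (copy_from_user(&hdr, (void __user *)arg, minsz))
		return -EFAULT;

	ret = vfio_set_irqs_validate_and_prepare(&hdr, EXAMPLE_NUM_IRQS,
						 EXAMPLE_NUM_IRQ_TYPES,
						 &data_size);
	if (ret)
		return ret;

	if (data_size) {
		/* Per-vector payload follows the header in the user buffer */
		data = memdup_user((void __user *)(arg + minsz), data_size);
		if (IS_ERR(data))
			return PTR_ERR(data);
	}

	ret = example_program_irqs(vdev, &hdr, data);
	kfree(data);
	return ret;
}
#endif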
1950 
1951 /*
1952  * Pin a set of guest PFNs and return their associated host PFNs for local
1953  * domain only.
1954  * @dev [in]     : device
1955  * @user_pfn [in]: array of user/guest PFNs to be pinned.
1956  * @npage [in]   : count of elements in user_pfn array.  This count should not
1957  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1958  * @prot [in]    : protection flags
1959  * @phys_pfn[out]: array of host PFNs
1960  * Return error or number of pages pinned.
1961  */
1962 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1963 		   int prot, unsigned long *phys_pfn)
1964 {
1965 	struct vfio_container *container;
1966 	struct vfio_group *group;
1967 	struct vfio_iommu_driver *driver;
1968 	int ret;
1969 
1970 	if (!dev || !user_pfn || !phys_pfn || !npage)
1971 		return -EINVAL;
1972 
1973 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1974 		return -E2BIG;
1975 
1976 	group = vfio_group_get_from_dev(dev);
1977 	if (!group)
1978 		return -ENODEV;
1979 
1980 	if (group->dev_counter > 1) {
1981 		ret = -EINVAL;
1982 		goto err_pin_pages;
1983 	}
1984 
1985 	ret = vfio_group_add_container_user(group);
1986 	if (ret)
1987 		goto err_pin_pages;
1988 
1989 	container = group->container;
1990 	driver = container->iommu_driver;
1991 	if (likely(driver && driver->ops->pin_pages))
1992 		ret = driver->ops->pin_pages(container->iommu_data,
1993 					     group->iommu_group, user_pfn,
1994 					     npage, prot, phys_pfn);
1995 	else
1996 		ret = -ENOTTY;
1997 
1998 	vfio_group_try_dissolve_container(group);
1999 
2000 err_pin_pages:
2001 	vfio_group_put(group);
2002 	return ret;
2003 }
2004 EXPORT_SYMBOL(vfio_pin_pages);
2005 
2006 /*
2007  * Unpin a set of host PFNs for the local domain only.
2008  * @dev [in]     : device
2009  * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
2010  *		   PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2011  * @npage [in]   : count of elements in user_pfn array.  This count should not
2012  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2013  * Return error or number of pages unpinned.
2014  */
2015 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
2016 {
2017 	struct vfio_container *container;
2018 	struct vfio_group *group;
2019 	struct vfio_iommu_driver *driver;
2020 	int ret;
2021 
2022 	if (!dev || !user_pfn || !npage)
2023 		return -EINVAL;
2024 
2025 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2026 		return -E2BIG;
2027 
2028 	group = vfio_group_get_from_dev(dev);
2029 	if (!group)
2030 		return -ENODEV;
2031 
2032 	ret = vfio_group_add_container_user(group);
2033 	if (ret)
2034 		goto err_unpin_pages;
2035 
2036 	container = group->container;
2037 	driver = container->iommu_driver;
2038 	if (likely(driver && driver->ops->unpin_pages))
2039 		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2040 					       npage);
2041 	else
2042 		ret = -ENOTTY;
2043 
2044 	vfio_group_try_dissolve_container(group);
2045 
2046 err_unpin_pages:
2047 	vfio_group_put(group);
2048 	return ret;
2049 }
2050 EXPORT_SYMBOL(vfio_unpin_pages);
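
/*
 * Example (illustrative sketch, not part of this file): a mediated-device
 * driver pinning a single guest PFN for DMA with vfio_pin_pages() and
 * unpinning it again with vfio_unpin_pages().  The function name and the
 * way the host PFN is consumed are hypothetical.
 */
#if 0
static int example_pin_one(struct device *mdev_dev, unsigned long gfn)
{
	unsigned long hpfn;
	int ret;

	ret = vfio_pin_pages(mdev_dev, &gfn, 1,
			     IOMMU_READ | IOMMU_WRITE, &hpfn);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... program the device with the host address of hpfn ... */

	return vfio_unpin_pages(mdev_dev, &gfn, 1) == 1 ? 0 : -EIO;
}
#endif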
2051 
2052 /*
2053  * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
2054  * VFIO group.
2055  *
2056  * The caller needs to call vfio_group_get_external_user() or
2057  * vfio_group_get_external_user_from_dev() prior to calling this interface,
2058  * so as to prevent the VFIO group from disposal in the middle of the call.
2059  * But it can keep the reference to the VFIO group for several calls into
2060  * this interface.
2061  * After finishing using the VFIO group, the caller needs to release the
2062  * VFIO group by calling vfio_group_put_external_user().
2063  *
2064  * @group [in]		: VFIO group
2065  * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be pinned.
2066  * @npage [in]		: count of elements in user_iova_pfn array.
2067  *			  This count should not be greater than
2068  *			  VFIO_PIN_PAGES_MAX_ENTRIES.
2069  * @prot [in]		: protection flags
2070  * @phys_pfn [out]	: array of host PFNs
2071  * Return error or number of pages pinned.
2072  */
2073 int vfio_group_pin_pages(struct vfio_group *group,
2074 			 unsigned long *user_iova_pfn, int npage,
2075 			 int prot, unsigned long *phys_pfn)
2076 {
2077 	struct vfio_container *container;
2078 	struct vfio_iommu_driver *driver;
2079 	int ret;
2080 
2081 	if (!group || !user_iova_pfn || !phys_pfn || !npage)
2082 		return -EINVAL;
2083 
2084 	if (group->dev_counter > 1)
2085 		return -EINVAL;
2086 
2087 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2088 		return -E2BIG;
2089 
2090 	container = group->container;
2091 	driver = container->iommu_driver;
2092 	if (likely(driver && driver->ops->pin_pages))
2093 		ret = driver->ops->pin_pages(container->iommu_data,
2094 					     group->iommu_group, user_iova_pfn,
2095 					     npage, prot, phys_pfn);
2096 	else
2097 		ret = -ENOTTY;
2098 
2099 	return ret;
2100 }
2101 EXPORT_SYMBOL(vfio_group_pin_pages);
2102 
2103 /*
2104  * Unpin a set of guest IOVA PFNs for a VFIO group.
2105  *
2106  * The caller needs to call vfio_group_get_external_user() or
2107  * vfio_group_get_external_user_from_dev() prior to calling this interface,
2108  * so as to prevent the VFIO group from disposal in the middle of the call.
2109  * But it can keep the reference to the VFIO group for several calls into
2110  * this interface.
2111  * After finishing using the VFIO group, the caller needs to release the
2112  * VFIO group by calling vfio_group_put_external_user().
2113  *
2114  * @group [in]		: vfio group
2115  * @user_iova_pfn [in]	: array of user/guest IOVA PFNs to be unpinned.
2116  * @npage [in]		: count of elements in user_iova_pfn array.
2117  *			  This count should not be greater than
2118  *			  VFIO_PIN_PAGES_MAX_ENTRIES.
2119  * Return error or number of pages unpinned.
2120  */
2121 int vfio_group_unpin_pages(struct vfio_group *group,
2122 			   unsigned long *user_iova_pfn, int npage)
2123 {
2124 	struct vfio_container *container;
2125 	struct vfio_iommu_driver *driver;
2126 	int ret;
2127 
2128 	if (!group || !user_iova_pfn || !npage)
2129 		return -EINVAL;
2130 
2131 	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2132 		return -E2BIG;
2133 
2134 	container = group->container;
2135 	driver = container->iommu_driver;
2136 	if (likely(driver && driver->ops->unpin_pages))
2137 		ret = driver->ops->unpin_pages(container->iommu_data,
2138 					       user_iova_pfn, npage);
2139 	else
2140 		ret = -ENOTTY;
2141 
2142 	return ret;
2143 }
2144 EXPORT_SYMBOL(vfio_group_unpin_pages);
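
/*
 * Example (illustrative sketch, not part of this file): the group-based
 * pin/unpin variants rely on the caller holding an external-user group
 * reference for the whole pin/unpin window rather than taking one per
 * call.  example_group_pin_cycle() is a hypothetical caller.
 */
#if 0
static int example_group_pin_cycle(struct file *group_file,
				   unsigned long iova_pfn)
{
	struct vfio_group *group;
	unsigned long hpfn;
	int ret;

	group = vfio_group_get_external_user(group_file);
	if (IS_ERR(group))
		return PTR_ERR(group);

	ret = vfio_group_pin_pages(group, &iova_pfn, 1,
				   IOMMU_READ | IOMMU_WRITE, &hpfn);
	if (ret == 1)
		ret = vfio_group_unpin_pages(group, &iova_pfn, 1) == 1 ?
		      0 : -EIO;
	else if (ret >= 0)
		ret = -EFAULT;

	vfio_group_put_external_user(group);
	return ret;
}
#endif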
2145 
2146 
2147 /*
2148  * This interface allows the CPUs to perform some sort of virtual DMA on
2149  * behalf of the device.
2150  *
2151  * CPUs read/write from/into a range of IOVAs pointing to user space memory
2152  * into/from a kernel buffer.
2153  *
2154  * As the read/write of user space memory is conducted via the CPUs and is
2155  * not a real device DMA, it is not necessary to pin the user space memory.
2156  *
2157  * The caller needs to call vfio_group_get_external_user() or
2158  * vfio_group_get_external_user_from_dev() prior to calling this interface,
2159  * so as to prevent the VFIO group from disposal in the middle of the call.
2160  * But it can keep the reference to the VFIO group for several calls into
2161  * this interface.
2162  * After finishing using the VFIO group, the caller needs to release the
2163  * VFIO group by calling vfio_group_put_external_user().
2164  *
2165  * @group [in]		: VFIO group
2166  * @user_iova [in]	: base IOVA of a user space buffer
2167  * @data [in]		: pointer to kernel buffer
2168  * @len [in]		: kernel buffer length
2169  * @write [in]	: true for a write, false for a read
2170  * Return error code on failure or 0 on success.
2171  */
2172 int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
2173 		void *data, size_t len, bool write)
2174 {
2175 	struct vfio_container *container;
2176 	struct vfio_iommu_driver *driver;
2177 	int ret = 0;
2178 
2179 	if (!group || !data || len <= 0)
2180 		return -EINVAL;
2181 
2182 	container = group->container;
2183 	driver = container->iommu_driver;
2184 
2185 	if (likely(driver && driver->ops->dma_rw))
2186 		ret = driver->ops->dma_rw(container->iommu_data,
2187 					  user_iova, data, len, write);
2188 	else
2189 		ret = -ENOTTY;
2190 
2191 	return ret;
2192 }
2193 EXPORT_SYMBOL(vfio_dma_rw);
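
/*
 * Example (illustrative sketch, not part of this file): reading a few
 * bytes of guest memory at a given IOVA through the container, on behalf
 * of the device.  The caller is assumed to already hold an external-user
 * group reference; example_read_guest() is hypothetical.
 */
#if 0
static int example_read_guest(struct vfio_group *group, dma_addr_t iova)
{
	u8 buf[16];
	int ret;

	ret = vfio_dma_rw(group, iova, buf, sizeof(buf), false /* read */);
	if (ret)
		return ret;

	print_hex_dump_bytes("guest: ", DUMP_PREFIX_OFFSET, buf, sizeof(buf));
	return 0;
}
#endif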
2194 
2195 static int vfio_register_iommu_notifier(struct vfio_group *group,
2196 					unsigned long *events,
2197 					struct notifier_block *nb)
2198 {
2199 	struct vfio_container *container;
2200 	struct vfio_iommu_driver *driver;
2201 	int ret;
2202 
2203 	ret = vfio_group_add_container_user(group);
2204 	if (ret)
2205 		return -EINVAL;
2206 
2207 	container = group->container;
2208 	driver = container->iommu_driver;
2209 	if (likely(driver && driver->ops->register_notifier))
2210 		ret = driver->ops->register_notifier(container->iommu_data,
2211 						     events, nb);
2212 	else
2213 		ret = -ENOTTY;
2214 
2215 	vfio_group_try_dissolve_container(group);
2216 
2217 	return ret;
2218 }
2219 
2220 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2221 					  struct notifier_block *nb)
2222 {
2223 	struct vfio_container *container;
2224 	struct vfio_iommu_driver *driver;
2225 	int ret;
2226 
2227 	ret = vfio_group_add_container_user(group);
2228 	if (ret)
2229 		return -EINVAL;
2230 
2231 	container = group->container;
2232 	driver = container->iommu_driver;
2233 	if (likely(driver && driver->ops->unregister_notifier))
2234 		ret = driver->ops->unregister_notifier(container->iommu_data,
2235 						       nb);
2236 	else
2237 		ret = -ENOTTY;
2238 
2239 	vfio_group_try_dissolve_container(group);
2240 
2241 	return ret;
2242 }
2243 
2244 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2245 {
2246 	group->kvm = kvm;
2247 	blocking_notifier_call_chain(&group->notifier,
2248 				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2249 }
2250 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
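
/*
 * Example (illustrative sketch, not part of this file): a consumer that
 * needs the struct kvm pointer registers a VFIO_GROUP_NOTIFY_SET_KVM
 * notifier through vfio_register_notifier(); if kvm was attached before
 * registration, the event is replayed once (see below).  The example_*
 * names are hypothetical.
 */
#if 0
static int example_kvm_cb(struct notifier_block *nb, unsigned long action,
			  void *data)
{
	if (action == VFIO_GROUP_NOTIFY_SET_KVM)
		pr_debug("kvm pointer is now %s\n", data ? "set" : "cleared");

	return NOTIFY_OK;
}

static int example_watch_kvm(struct device *dev, struct notifier_block *nb)
{
	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;

	nb->notifier_call = example_kvm_cb;
	return vfio_register_notifier(dev, VFIO_GROUP_NOTIFY, &events, nb);
}
#endif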
2251 
2252 static int vfio_register_group_notifier(struct vfio_group *group,
2253 					unsigned long *events,
2254 					struct notifier_block *nb)
2255 {
2256 	int ret;
2257 	bool set_kvm = false;
2258 
2259 	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2260 		set_kvm = true;
2261 
2262 	/* clear known events */
2263 	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2264 
2265 	/* refuse to continue if there are still events remaining */
2266 	if (*events)
2267 		return -EINVAL;
2268 
2269 	ret = vfio_group_add_container_user(group);
2270 	if (ret)
2271 		return -EINVAL;
2272 
2273 	ret = blocking_notifier_chain_register(&group->notifier, nb);
2274 
2275 	/*
2276 	 * The attaching of kvm and vfio_group might have already happened,
2277 	 * so replay it once here upon registration.
2278 	 */
2279 	if (!ret && set_kvm && group->kvm)
2280 		blocking_notifier_call_chain(&group->notifier,
2281 					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2282 
2283 	vfio_group_try_dissolve_container(group);
2284 
2285 	return ret;
2286 }
2287 
2288 static int vfio_unregister_group_notifier(struct vfio_group *group,
2289 					 struct notifier_block *nb)
2290 {
2291 	int ret;
2292 
2293 	ret = vfio_group_add_container_user(group);
2294 	if (ret)
2295 		return -EINVAL;
2296 
2297 	ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2298 
2299 	vfio_group_try_dissolve_container(group);
2300 
2301 	return ret;
2302 }
2303 
2304 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2305 			   unsigned long *events, struct notifier_block *nb)
2306 {
2307 	struct vfio_group *group;
2308 	int ret;
2309 
2310 	if (!dev || !nb || !events || (*events == 0))
2311 		return -EINVAL;
2312 
2313 	group = vfio_group_get_from_dev(dev);
2314 	if (!group)
2315 		return -ENODEV;
2316 
2317 	switch (type) {
2318 	case VFIO_IOMMU_NOTIFY:
2319 		ret = vfio_register_iommu_notifier(group, events, nb);
2320 		break;
2321 	case VFIO_GROUP_NOTIFY:
2322 		ret = vfio_register_group_notifier(group, events, nb);
2323 		break;
2324 	default:
2325 		ret = -EINVAL;
2326 	}
2327 
2328 	vfio_group_put(group);
2329 	return ret;
2330 }
2331 EXPORT_SYMBOL(vfio_register_notifier);
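
/*
 * Example (illustrative sketch, not part of this file): a vendor driver
 * registering for DMA unmap notifications so it can invalidate pages it
 * pinned earlier.  The example_* names are hypothetical; the notifier
 * type and event bit are the real ones from <linux/vfio.h>.
 */
#if 0
static int example_dma_unmap_cb(struct notifier_block *nb,
				unsigned long action, void *data)
{
	struct vfio_iommu_type1_dma_unmap *unmap = data;

	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP)
		pr_debug("unmap iova 0x%llx size 0x%llx\n",
			 (unsigned long long)unmap->iova,
			 (unsigned long long)unmap->size);

	return NOTIFY_OK;
}

static struct notifier_block example_unmap_nb = {
	.notifier_call = example_dma_unmap_cb,
};

static int example_register(struct device *dev)
{
	/* events selects which notifications this block wants to receive */
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	return vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events,
				      &example_unmap_nb);
}
#endif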
2332 
2333 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2334 			     struct notifier_block *nb)
2335 {
2336 	struct vfio_group *group;
2337 	int ret;
2338 
2339 	if (!dev || !nb)
2340 		return -EINVAL;
2341 
2342 	group = vfio_group_get_from_dev(dev);
2343 	if (!group)
2344 		return -ENODEV;
2345 
2346 	switch (type) {
2347 	case VFIO_IOMMU_NOTIFY:
2348 		ret = vfio_unregister_iommu_notifier(group, nb);
2349 		break;
2350 	case VFIO_GROUP_NOTIFY:
2351 		ret = vfio_unregister_group_notifier(group, nb);
2352 		break;
2353 	default:
2354 		ret = -EINVAL;
2355 	}
2356 
2357 	vfio_group_put(group);
2358 	return ret;
2359 }
2360 EXPORT_SYMBOL(vfio_unregister_notifier);
2361 
2362 struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group)
2363 {
2364 	struct vfio_container *container;
2365 	struct vfio_iommu_driver *driver;
2366 
2367 	if (!group)
2368 		return ERR_PTR(-EINVAL);
2369 
2370 	container = group->container;
2371 	driver = container->iommu_driver;
2372 	if (likely(driver && driver->ops->group_iommu_domain))
2373 		return driver->ops->group_iommu_domain(container->iommu_data,
2374 						       group->iommu_group);
2375 
2376 	return ERR_PTR(-ENOTTY);
2377 }
2378 EXPORT_SYMBOL_GPL(vfio_group_iommu_domain);
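
/*
 * Example (illustrative sketch, not part of this file): retrieving the
 * IOMMU domain backing a group, e.g. to share it with another kernel
 * subsystem.  example_get_domain() is hypothetical.
 */
#if 0
static struct iommu_domain *example_get_domain(struct vfio_group *group)
{
	struct iommu_domain *domain = vfio_group_iommu_domain(group);

	/* ERR_PTR(-ENOTTY) means the backing driver has no such callback */
	return IS_ERR(domain) ? NULL : domain;
}
#endif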
2379 
2380 /**
2381  * Module/class support
2382  */
2383 static char *vfio_devnode(struct device *dev, umode_t *mode)
2384 {
2385 	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2386 }
2387 
2388 static struct miscdevice vfio_dev = {
2389 	.minor = VFIO_MINOR,
2390 	.name = "vfio",
2391 	.fops = &vfio_fops,
2392 	.nodename = "vfio/vfio",
2393 	.mode = S_IRUGO | S_IWUGO,
2394 };
2395 
2396 static int __init vfio_init(void)
2397 {
2398 	int ret;
2399 
2400 	idr_init(&vfio.group_idr);
2401 	mutex_init(&vfio.group_lock);
2402 	mutex_init(&vfio.iommu_drivers_lock);
2403 	INIT_LIST_HEAD(&vfio.group_list);
2404 	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2405 
2406 	ret = misc_register(&vfio_dev);
2407 	if (ret) {
2408 		pr_err("vfio: misc device register failed\n");
2409 		return ret;
2410 	}
2411 
2412 	/* /dev/vfio/$GROUP */
2413 	vfio.class = class_create(THIS_MODULE, "vfio");
2414 	if (IS_ERR(vfio.class)) {
2415 		ret = PTR_ERR(vfio.class);
2416 		goto err_class;
2417 	}
2418 
2419 	vfio.class->devnode = vfio_devnode;
2420 
2421 	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2422 	if (ret)
2423 		goto err_alloc_chrdev;
2424 
2425 	cdev_init(&vfio.group_cdev, &vfio_group_fops);
2426 	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2427 	if (ret)
2428 		goto err_cdev_add;
2429 
2430 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2431 
2432 #ifdef CONFIG_VFIO_NOIOMMU
2433 	vfio_register_iommu_driver(&vfio_noiommu_ops);
2434 #endif
2435 	return 0;
2436 
2437 err_cdev_add:
2438 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2439 err_alloc_chrdev:
2440 	class_destroy(vfio.class);
2441 	vfio.class = NULL;
2442 err_class:
2443 	misc_deregister(&vfio_dev);
2444 	return ret;
2445 }
2446 
2447 static void __exit vfio_cleanup(void)
2448 {
2449 	WARN_ON(!list_empty(&vfio.group_list));
2450 
2451 #ifdef CONFIG_VFIO_NOIOMMU
2452 	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2453 #endif
2454 	idr_destroy(&vfio.group_idr);
2455 	cdev_del(&vfio.group_cdev);
2456 	unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2457 	class_destroy(vfio.class);
2458 	vfio.class = NULL;
2459 	misc_deregister(&vfio_dev);
2460 	xa_destroy(&vfio_device_set_xa);
2461 }
2462 
2463 module_init(vfio_init);
2464 module_exit(vfio_cleanup);
2465 
2466 MODULE_VERSION(DRIVER_VERSION);
2467 MODULE_LICENSE("GPL v2");
2468 MODULE_AUTHOR(DRIVER_AUTHOR);
2469 MODULE_DESCRIPTION(DRIVER_DESC);
2470 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2471 MODULE_ALIAS("devname:vfio/vfio");
2472 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
2473