1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #if IS_ENABLED(CONFIG_KVM)
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mount.h>
26 #include <linux/mutex.h>
27 #include <linux/pci.h>
28 #include <linux/pseudo_fs.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
37 #include <linux/sched/signal.h>
38 #include <linux/pm_runtime.h>
39 #include <linux/interval_tree.h>
40 #include <linux/iova_bitmap.h>
41 #include <linux/iommufd.h>
42 #include "vfio.h"
43 
44 #define DRIVER_VERSION	"0.3"
45 #define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
46 #define DRIVER_DESC	"VFIO - User Level meta-driver"
47 
48 #define VFIO_MAGIC 0x5646494f /* "VFIO" */
49 
50 static struct vfio {
51 	struct class			*device_class;
52 	struct ida			device_ida;
53 	struct vfsmount			*vfs_mount;
54 	int				fs_count;
55 } vfio;
56 
57 #ifdef CONFIG_VFIO_NOIOMMU
58 bool vfio_noiommu __read_mostly;
59 module_param_named(enable_unsafe_noiommu_mode,
60 		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
61 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
62 #endif
63 
64 static DEFINE_XARRAY(vfio_device_set_xa);
65 
66 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
67 {
68 	unsigned long idx = (unsigned long)set_id;
69 	struct vfio_device_set *new_dev_set;
70 	struct vfio_device_set *dev_set;
71 
72 	if (WARN_ON(!set_id))
73 		return -EINVAL;
74 
75 	/*
76 	 * Atomically acquire a singleton object in the xarray for this set_id
77 	 */
78 	xa_lock(&vfio_device_set_xa);
79 	dev_set = xa_load(&vfio_device_set_xa, idx);
80 	if (dev_set)
81 		goto found_get_ref;
82 	xa_unlock(&vfio_device_set_xa);
83 
84 	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
85 	if (!new_dev_set)
86 		return -ENOMEM;
87 	mutex_init(&new_dev_set->lock);
88 	INIT_LIST_HEAD(&new_dev_set->device_list);
89 	new_dev_set->set_id = set_id;
90 
91 	xa_lock(&vfio_device_set_xa);
92 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
93 			       GFP_KERNEL);
94 	if (!dev_set) {
95 		dev_set = new_dev_set;
96 		goto found_get_ref;
97 	}
98 
99 	kfree(new_dev_set);
100 	if (xa_is_err(dev_set)) {
101 		xa_unlock(&vfio_device_set_xa);
102 		return xa_err(dev_set);
103 	}
104 
105 found_get_ref:
106 	dev_set->device_count++;
107 	xa_unlock(&vfio_device_set_xa);
108 	mutex_lock(&dev_set->lock);
109 	device->dev_set = dev_set;
110 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
111 	mutex_unlock(&dev_set->lock);
112 	return 0;
113 }
114 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
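
/*
 * Illustrative use (not from this file): a hypothetical driver whose
 * functions must be reset together can key the set on a shared parent
 * object from its ->init() callback, before registering the device:
 *
 *	static int my_vfio_init(struct vfio_device *vdev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(vdev->dev);
 *
 *		// Devices passing the same set_id pointer share one dev_set.
 *		return vfio_assign_device_set(vdev, pdev->slot ?
 *					      (void *)pdev->slot :
 *					      (void *)pdev->bus);
 *	}
 *
 * Devices that never call this get a singleton set assigned at registration
 * time, see __vfio_register_dev().
 */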
115 
116 static void vfio_release_device_set(struct vfio_device *device)
117 {
118 	struct vfio_device_set *dev_set = device->dev_set;
119 
120 	if (!dev_set)
121 		return;
122 
123 	mutex_lock(&dev_set->lock);
124 	list_del(&device->dev_set_list);
125 	mutex_unlock(&dev_set->lock);
126 
127 	xa_lock(&vfio_device_set_xa);
128 	if (!--dev_set->device_count) {
129 		__xa_erase(&vfio_device_set_xa,
130 			   (unsigned long)dev_set->set_id);
131 		mutex_destroy(&dev_set->lock);
132 		kfree(dev_set);
133 	}
134 	xa_unlock(&vfio_device_set_xa);
135 }
136 
137 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
138 {
139 	struct vfio_device *cur;
140 	unsigned int open_count = 0;
141 
142 	lockdep_assert_held(&dev_set->lock);
143 
144 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
145 		open_count += cur->open_count;
146 	return open_count;
147 }
148 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
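
/*
 * Illustrative driver-side check (helper name hypothetical): a set-wide
 * reset is typically only allowed when the caller's set has no other opens:
 *
 *	static bool my_reset_allowed(struct vfio_device *vdev)
 *	{
 *		bool only_user;
 *
 *		mutex_lock(&vdev->dev_set->lock);
 *		only_user = vfio_device_set_open_count(vdev->dev_set) == 1;
 *		mutex_unlock(&vdev->dev_set->lock);
 *		return only_user;
 *	}
 */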
149 
150 struct vfio_device *
151 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
152 			   struct device *dev)
153 {
154 	struct vfio_device *cur;
155 
156 	lockdep_assert_held(&dev_set->lock);
157 
158 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
159 		if (cur->dev == dev)
160 			return cur;
161 	return NULL;
162 }
163 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
164 
165 /*
166  * Device objects - create, release, get, put, search
167  */
168 /* Device reference always implies a group reference */
169 void vfio_device_put_registration(struct vfio_device *device)
170 {
171 	if (refcount_dec_and_test(&device->refcount))
172 		complete(&device->comp);
173 }
174 
175 bool vfio_device_try_get_registration(struct vfio_device *device)
176 {
177 	return refcount_inc_not_zero(&device->refcount);
178 }
179 
180 /*
181  * VFIO driver API
182  */
183 /* Release helper called by vfio_put_device() */
184 static void vfio_device_release(struct device *dev)
185 {
186 	struct vfio_device *device =
187 			container_of(dev, struct vfio_device, device);
188 
189 	vfio_release_device_set(device);
190 	ida_free(&vfio.device_ida, device->index);
191 
192 	if (device->ops->release)
193 		device->ops->release(device);
194 
195 	iput(device->inode);
196 	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
197 	kvfree(device);
198 }
199 
200 static int vfio_init_device(struct vfio_device *device, struct device *dev,
201 			    const struct vfio_device_ops *ops);
202 
203 /*
204  * Allocate and initialize vfio_device so it can be registered to vfio
205  * core.
206  *
207  * Drivers should use the wrapper vfio_alloc_device() for allocation.
208  * @size is the size of the structure to be allocated, including any
209  * private data used by the driver.
210  *
211  * The driver may provide an @init callback to set up its device private data.
212  *
213  * Use vfio_put_device() to release the structure after a successful return.
214  */
215 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
216 				       const struct vfio_device_ops *ops)
217 {
218 	struct vfio_device *device;
219 	int ret;
220 
221 	if (WARN_ON(size < sizeof(struct vfio_device)))
222 		return ERR_PTR(-EINVAL);
223 
224 	device = kvzalloc(size, GFP_KERNEL);
225 	if (!device)
226 		return ERR_PTR(-ENOMEM);
227 
228 	ret = vfio_init_device(device, dev, ops);
229 	if (ret)
230 		goto out_free;
231 	return device;
232 
233 out_free:
234 	kvfree(device);
235 	return ERR_PTR(ret);
236 }
237 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
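
/*
 * Illustrative use of the vfio_alloc_device() wrapper (driver structure,
 * member and ops names are hypothetical).  The embedded vfio_device must be
 * the first member; the wrapper checks its offset at build time:
 *
 *	struct my_vfio_device {
 *		struct vfio_device vdev;	// must be at offset 0
 *		void __iomem *bar0;		// driver private data
 *	};
 *
 *	my = vfio_alloc_device(my_vfio_device, vdev, dev, &my_vfio_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	...
 *	vfio_put_device(&my->vdev);	// on a later error path
 */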
238 
239 static int vfio_fs_init_fs_context(struct fs_context *fc)
240 {
241 	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
242 }
243 
244 static struct file_system_type vfio_fs_type = {
245 	.name = "vfio",
246 	.owner = THIS_MODULE,
247 	.init_fs_context = vfio_fs_init_fs_context,
248 	.kill_sb = kill_anon_super,
249 };
250 
251 static struct inode *vfio_fs_inode_new(void)
252 {
253 	struct inode *inode;
254 	int ret;
255 
256 	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
257 	if (ret)
258 		return ERR_PTR(ret);
259 
260 	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
261 	if (IS_ERR(inode))
262 		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
263 
264 	return inode;
265 }
266 
267 /*
268  * Initialize a vfio_device so it can be registered to vfio core.
269  */
270 static int vfio_init_device(struct vfio_device *device, struct device *dev,
271 			    const struct vfio_device_ops *ops)
272 {
273 	int ret;
274 
275 	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
276 	if (ret < 0) {
277 		dev_dbg(dev, "Failed to allocate index\n");
278 		return ret;
279 	}
280 
281 	device->index = ret;
282 	init_completion(&device->comp);
283 	device->dev = dev;
284 	device->ops = ops;
285 	device->inode = vfio_fs_inode_new();
286 	if (IS_ERR(device->inode)) {
287 		ret = PTR_ERR(device->inode);
288 		goto out_inode;
289 	}
290 
291 	if (ops->init) {
292 		ret = ops->init(device);
293 		if (ret)
294 			goto out_uninit;
295 	}
296 
297 	device_initialize(&device->device);
298 	device->device.release = vfio_device_release;
299 	device->device.class = vfio.device_class;
300 	device->device.parent = device->dev;
301 	return 0;
302 
303 out_uninit:
304 	iput(device->inode);
305 	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
306 out_inode:
307 	vfio_release_device_set(device);
308 	ida_free(&vfio.device_ida, device->index);
309 	return ret;
310 }
311 
312 static int __vfio_register_dev(struct vfio_device *device,
313 			       enum vfio_group_type type)
314 {
315 	int ret;
316 
317 	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
318 		    (!device->ops->bind_iommufd ||
319 		     !device->ops->unbind_iommufd ||
320 		     !device->ops->attach_ioas ||
321 		     !device->ops->detach_ioas)))
322 		return -EINVAL;
323 
324 	/*
325 	 * If the driver doesn't specify a set then the device is added to a
326 	 * singleton set just for itself.
327 	 */
328 	if (!device->dev_set)
329 		vfio_assign_device_set(device, device);
330 
331 	ret = dev_set_name(&device->device, "vfio%d", device->index);
332 	if (ret)
333 		return ret;
334 
335 	ret = vfio_device_set_group(device, type);
336 	if (ret)
337 		return ret;
338 
339 	/*
340 	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
341 	 * restore cache coherency. It has to be checked here because it is only
342 	 * valid for cases where we are using iommu groups.
343 	 */
344 	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
345 	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
346 		ret = -EINVAL;
347 		goto err_out;
348 	}
349 
350 	ret = vfio_device_add(device);
351 	if (ret)
352 		goto err_out;
353 
354 	/* Refcounting can't start until the driver calls register */
355 	refcount_set(&device->refcount, 1);
356 
357 	vfio_device_group_register(device);
358 	vfio_device_debugfs_init(device);
359 
360 	return 0;
361 err_out:
362 	vfio_device_remove_group(device);
363 	return ret;
364 }
365 
366 int vfio_register_group_dev(struct vfio_device *device)
367 {
368 	return __vfio_register_dev(device, VFIO_IOMMU);
369 }
370 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
371 
372 /*
373  * Register a virtual device without IOMMU backing.  The user of this
374  * device must not be able to directly trigger unmediated DMA.
375  */
376 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
377 {
378 	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
379 }
380 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
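
/*
 * Illustrative probe/remove flow tying the helpers above together (all
 * driver-side names are hypothetical):
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		struct my_vfio_device *my;
 *		int ret;
 *
 *		my = vfio_alloc_device(my_vfio_device, vdev, &pdev->dev, &my_ops);
 *		if (IS_ERR(my))
 *			return PTR_ERR(my);
 *
 *		ret = vfio_register_group_dev(&my->vdev);
 *		if (ret) {
 *			vfio_put_device(&my->vdev);
 *			return ret;
 *		}
 *		pci_set_drvdata(pdev, my);
 *		return 0;
 *	}
 *
 *	static void my_remove(struct pci_dev *pdev)
 *	{
 *		struct my_vfio_device *my = pci_get_drvdata(pdev);
 *
 *		vfio_unregister_group_dev(&my->vdev);
 *		vfio_put_device(&my->vdev);
 *	}
 */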
381 
382 /*
383  * Decrement the device reference count and wait for the device to be
384  * removed.  Users holding open file descriptors are asked to release the
 * device via the driver's ->request() callback while this function waits.
 */
385 void vfio_unregister_group_dev(struct vfio_device *device)
386 {
387 	unsigned int i = 0;
388 	bool interrupted = false;
389 	long rc;
390 
391 	/*
392 	 * Prevent the device from being newly opened by userspace via
393 	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
394 	 */
395 	vfio_device_group_unregister(device);
396 
397 	/*
398 	 * Balances vfio_device_add() in the register path and also prevents
399 	 * the device from being newly opened by userspace in the cdev path.
400 	 */
401 	vfio_device_del(device);
402 
403 	vfio_device_put_registration(device);
404 	rc = try_wait_for_completion(&device->comp);
405 	while (rc <= 0) {
406 		if (device->ops->request)
407 			device->ops->request(device, i++);
408 
409 		if (interrupted) {
410 			rc = wait_for_completion_timeout(&device->comp,
411 							 HZ * 10);
412 		} else {
413 			rc = wait_for_completion_interruptible_timeout(
414 				&device->comp, HZ * 10);
415 			if (rc < 0) {
416 				interrupted = true;
417 				dev_warn(device->dev,
418 					 "Device is currently in use, task"
419 					 " \"%s\" (%d) "
420 					 "blocked until device is released",
421 					 current->comm, task_pid_nr(current));
422 			}
423 		}
424 	}
425 
426 	vfio_device_debugfs_exit(device);
427 	/* Balances vfio_device_set_group in register path */
428 	vfio_device_remove_group(device);
429 }
430 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
431 
432 #if IS_ENABLED(CONFIG_KVM)
433 void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
434 {
435 	void (*pfn)(struct kvm *kvm);
436 	bool (*fn)(struct kvm *kvm);
437 	bool ret;
438 
439 	lockdep_assert_held(&device->dev_set->lock);
440 
441 	if (!kvm)
442 		return;
443 
444 	pfn = symbol_get(kvm_put_kvm);
445 	if (WARN_ON(!pfn))
446 		return;
447 
448 	fn = symbol_get(kvm_get_kvm_safe);
449 	if (WARN_ON(!fn)) {
450 		symbol_put(kvm_put_kvm);
451 		return;
452 	}
453 
454 	ret = fn(kvm);
455 	symbol_put(kvm_get_kvm_safe);
456 	if (!ret) {
457 		symbol_put(kvm_put_kvm);
458 		return;
459 	}
460 
461 	device->put_kvm = pfn;
462 	device->kvm = kvm;
463 }
464 
465 void vfio_device_put_kvm(struct vfio_device *device)
466 {
467 	lockdep_assert_held(&device->dev_set->lock);
468 
469 	if (!device->kvm)
470 		return;
471 
472 	if (WARN_ON(!device->put_kvm))
473 		goto clear;
474 
475 	device->put_kvm(device->kvm);
476 	device->put_kvm = NULL;
477 	symbol_put(kvm_put_kvm);
478 
479 clear:
480 	device->kvm = NULL;
481 }
482 
483 static struct vfio_device *vfio_device_from_file(struct file *file);
484 /**
485  * vfio_file_get_device - Return struct device from vfio device fd
486  * @file: VFIO device file
487  */
488 struct device *vfio_file_get_device(struct file *file)
489 {
490 	struct vfio_device *device = vfio_device_from_file(file);
491 
492 	return device ? device->dev : NULL;
493 }
494 EXPORT_SYMBOL_GPL(vfio_file_get_device);
495 #endif
496 
497 /* true if the vfio_device has open_device() called but not close_device() */
498 static bool vfio_assert_device_open(struct vfio_device *device)
499 {
500 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
501 }
502 
503 struct vfio_device_file *
504 vfio_allocate_device_file(struct vfio_device *device)
505 {
506 	struct vfio_device_file *df;
507 
508 	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
509 	if (!df)
510 		return ERR_PTR(-ENOMEM);
511 
512 	df->device = device;
513 	spin_lock_init(&df->kvm_ref_lock);
514 
515 	return df;
516 }
517 
518 static int vfio_df_device_first_open(struct vfio_device_file *df)
519 {
520 	struct vfio_device *device = df->device;
521 	struct iommufd_ctx *iommufd = df->iommufd;
522 	int ret;
523 
524 	lockdep_assert_held(&device->dev_set->lock);
525 
526 	if (!try_module_get(device->dev->driver->owner))
527 		return -ENODEV;
528 
529 	if (iommufd)
530 		ret = vfio_df_iommufd_bind(df);
531 	else
532 		ret = vfio_device_group_use_iommu(device);
533 	if (ret)
534 		goto err_module_put;
535 
536 	if (device->ops->open_device) {
537 		ret = device->ops->open_device(device);
538 		if (ret)
539 			goto err_unuse_iommu;
540 	}
541 	return 0;
542 
543 err_unuse_iommu:
544 	if (iommufd)
545 		vfio_df_iommufd_unbind(df);
546 	else
547 		vfio_device_group_unuse_iommu(device);
548 err_module_put:
549 	module_put(device->dev->driver->owner);
550 	return ret;
551 }
552 
553 static void vfio_df_device_last_close(struct vfio_device_file *df)
554 {
555 	struct vfio_device *device = df->device;
556 	struct iommufd_ctx *iommufd = df->iommufd;
557 
558 	lockdep_assert_held(&device->dev_set->lock);
559 
560 	if (device->ops->close_device)
561 		device->ops->close_device(device);
562 	if (iommufd)
563 		vfio_df_iommufd_unbind(df);
564 	else
565 		vfio_device_group_unuse_iommu(device);
566 	module_put(device->dev->driver->owner);
567 }
568 
569 int vfio_df_open(struct vfio_device_file *df)
570 {
571 	struct vfio_device *device = df->device;
572 	int ret = 0;
573 
574 	lockdep_assert_held(&device->dev_set->lock);
575 
576 	/*
577 	 * Only the group path allows the device to be opened multiple
578 	 * times.  The cdev path has no secure way to support that.
579 	 */
580 	if (device->open_count != 0 && !df->group)
581 		return -EINVAL;
582 
583 	device->open_count++;
584 	if (device->open_count == 1) {
585 		ret = vfio_df_device_first_open(df);
586 		if (ret)
587 			device->open_count--;
588 	}
589 
590 	return ret;
591 }
592 
593 void vfio_df_close(struct vfio_device_file *df)
594 {
595 	struct vfio_device *device = df->device;
596 
597 	lockdep_assert_held(&device->dev_set->lock);
598 
599 	if (!vfio_assert_device_open(device))
600 		return;
601 	if (device->open_count == 1)
602 		vfio_df_device_last_close(df);
603 	device->open_count--;
604 }
605 
606 /*
607  * Wrapper around pm_runtime_resume_and_get().
608  * Return error code on failure or 0 on success.
609  */
610 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
611 {
612 	struct device *dev = device->dev;
613 
614 	if (dev->driver && dev->driver->pm) {
615 		int ret;
616 
617 		ret = pm_runtime_resume_and_get(dev);
618 		if (ret) {
619 			dev_info_ratelimited(dev,
620 				"vfio: runtime resume failed %d\n", ret);
621 			return -EIO;
622 		}
623 	}
624 
625 	return 0;
626 }
627 
628 /*
629  * Wrapper around pm_runtime_put().
630  */
631 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
632 {
633 	struct device *dev = device->dev;
634 
635 	if (dev->driver && dev->driver->pm)
636 		pm_runtime_put(dev);
637 }
638 
639 /*
640  * VFIO Device fd
641  */
642 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
643 {
644 	struct vfio_device_file *df = filep->private_data;
645 	struct vfio_device *device = df->device;
646 
647 	if (df->group)
648 		vfio_df_group_close(df);
649 	else
650 		vfio_df_unbind_iommufd(df);
651 
652 	vfio_device_put_registration(device);
653 
654 	kfree(df);
655 
656 	return 0;
657 }
658 
659 /*
660  * vfio_mig_get_next_state - Compute the next step in the FSM
661  * @cur_fsm - The current state the device is in
662  * @new_fsm - The target state to reach
663  * @next_fsm - Pointer to the next step to get to new_fsm
664  *
665  * Return 0 upon success, otherwise -errno
666  * Upon success the next step in the state progression between cur_fsm and
667  * new_fsm will be set in next_fsm.
668  *
669  * This breaks down requests for combination transitions into smaller steps and
670  * returns the next step to get to new_fsm. The function may need to be called
671  * multiple times before reaching new_fsm.
672  *
673  */
674 int vfio_mig_get_next_state(struct vfio_device *device,
675 			    enum vfio_device_mig_state cur_fsm,
676 			    enum vfio_device_mig_state new_fsm,
677 			    enum vfio_device_mig_state *next_fsm)
678 {
679 	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
680 	/*
681 	 * The coding in this table requires the driver to implement the
682 	 * following FSM arcs:
683 	 *         RESUMING -> STOP
684 	 *         STOP -> RESUMING
685 	 *         STOP -> STOP_COPY
686 	 *         STOP_COPY -> STOP
687 	 *
688 	 * If P2P is supported then the driver must also implement these FSM
689 	 * arcs:
690 	 *         RUNNING -> RUNNING_P2P
691 	 *         RUNNING_P2P -> RUNNING
692 	 *         RUNNING_P2P -> STOP
693 	 *         STOP -> RUNNING_P2P
694 	 *
695 	 * If precopy is supported then the driver must support these additional
696 	 * FSM arcs:
697 	 *         RUNNING -> PRE_COPY
698 	 *         PRE_COPY -> RUNNING
699 	 *         PRE_COPY -> STOP_COPY
700 	 * However, if precopy and P2P are supported together then the driver
701 	 * must support these additional arcs beyond the P2P arcs above:
702 	 *         PRE_COPY -> RUNNING
703 	 *         PRE_COPY -> PRE_COPY_P2P
704 	 *         PRE_COPY_P2P -> PRE_COPY
705 	 *         PRE_COPY_P2P -> RUNNING_P2P
706 	 *         PRE_COPY_P2P -> STOP_COPY
707 	 *         RUNNING -> PRE_COPY
708 	 *         RUNNING_P2P -> PRE_COPY_P2P
709 	 *
710 	 * Without P2P and precopy the driver must implement:
711 	 *         RUNNING -> STOP
712 	 *         STOP -> RUNNING
713 	 *
714 	 * The coding will step through multiple states for some combination
715 	 * transitions; if all optional features are supported, this means the
716 	 * following ones:
717 	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
718 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
719 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
720 	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
721 	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
722 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
723 	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
724 	 *         RESUMING -> STOP -> RUNNING_P2P
725 	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
726 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
727 	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
728 	 *         RESUMING -> STOP -> STOP_COPY
729 	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
730 	 *         RUNNING -> RUNNING_P2P -> STOP
731 	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
732 	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
733 	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
734 	 *         RUNNING_P2P -> STOP -> RESUMING
735 	 *         RUNNING_P2P -> STOP -> STOP_COPY
736 	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
737 	 *         STOP -> RUNNING_P2P -> RUNNING
738 	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
739 	 *         STOP_COPY -> STOP -> RESUMING
740 	 *         STOP_COPY -> STOP -> RUNNING_P2P
741 	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
742 	 *
743 	 *  The following transitions are blocked:
744 	 *         STOP_COPY -> PRE_COPY
745 	 *         STOP_COPY -> PRE_COPY_P2P
746 	 */
747 	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
748 		[VFIO_DEVICE_STATE_STOP] = {
749 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
750 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
751 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
752 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
753 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
754 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
755 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
756 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
757 		},
758 		[VFIO_DEVICE_STATE_RUNNING] = {
759 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
760 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
761 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
762 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
763 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
764 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
765 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
766 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
767 		},
768 		[VFIO_DEVICE_STATE_PRE_COPY] = {
769 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
770 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
771 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
772 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
773 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
774 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
775 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
776 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
777 		},
778 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
779 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
780 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
781 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
782 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
783 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
784 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
785 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
786 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
787 		},
788 		[VFIO_DEVICE_STATE_STOP_COPY] = {
789 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
790 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
791 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
792 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
793 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
794 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
795 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
796 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
797 		},
798 		[VFIO_DEVICE_STATE_RESUMING] = {
799 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
800 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
801 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
802 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
803 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
804 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
805 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
806 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
807 		},
808 		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
809 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
810 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
811 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
812 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
813 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
814 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
815 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
816 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
817 		},
818 		[VFIO_DEVICE_STATE_ERROR] = {
819 			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
820 			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
821 			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
822 			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
823 			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
824 			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
825 			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
826 			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
827 		},
828 	};
829 
830 	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
831 		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
832 		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
833 		[VFIO_DEVICE_STATE_PRE_COPY] =
834 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
835 		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
836 						   VFIO_MIGRATION_P2P |
837 						   VFIO_MIGRATION_PRE_COPY,
838 		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
839 		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
840 		[VFIO_DEVICE_STATE_RUNNING_P2P] =
841 			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
842 		[VFIO_DEVICE_STATE_ERROR] = ~0U,
843 	};
844 
845 	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
846 		    (state_flags_table[cur_fsm] & device->migration_flags) !=
847 			state_flags_table[cur_fsm]))
848 		return -EINVAL;
849 
850 	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
851 	   (state_flags_table[new_fsm] & device->migration_flags) !=
852 			state_flags_table[new_fsm])
853 		return -EINVAL;
854 
855 	/*
856 	 * Arcs touching optional and unsupported states are skipped over. The
857 	 * driver will instead see an arc from the original state to the next
858 	 * logical state, as per the above comment.
859 	 */
860 	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
861 	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
862 			state_flags_table[*next_fsm])
863 		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
864 
865 	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
866 }
867 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
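
/*
 * Illustrative driver loop (my_mig_step() is a hypothetical helper that
 * performs exactly one FSM arc): a migration_set_state() implementation
 * typically walks from the current state to the requested one step by step:
 *
 *	while (cur != new) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *
 *		filp = my_mig_step(vdev, cur, next);	// perform one arc
 *		if (IS_ERR(filp))
 *			return filp;
 *		cur = next;
 *	}
 */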
868 
869 /*
870  * Convert the driver's struct file into an FD number and return it to userspace
871  */
872 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
873 				   struct vfio_device_feature_mig_state *mig)
874 {
875 	int ret;
876 	int fd;
877 
878 	fd = get_unused_fd_flags(O_CLOEXEC);
879 	if (fd < 0) {
880 		ret = fd;
881 		goto out_fput;
882 	}
883 
884 	mig->data_fd = fd;
885 	if (copy_to_user(arg, mig, sizeof(*mig))) {
886 		ret = -EFAULT;
887 		goto out_put_unused;
888 	}
889 	fd_install(fd, filp);
890 	return 0;
891 
892 out_put_unused:
893 	put_unused_fd(fd);
894 out_fput:
895 	fput(filp);
896 	return ret;
897 }
898 
899 static int
900 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
901 					   u32 flags, void __user *arg,
902 					   size_t argsz)
903 {
904 	size_t minsz =
905 		offsetofend(struct vfio_device_feature_mig_state, data_fd);
906 	struct vfio_device_feature_mig_state mig;
907 	struct file *filp = NULL;
908 	int ret;
909 
910 	if (!device->mig_ops)
911 		return -ENOTTY;
912 
913 	ret = vfio_check_feature(flags, argsz,
914 				 VFIO_DEVICE_FEATURE_SET |
915 				 VFIO_DEVICE_FEATURE_GET,
916 				 sizeof(mig));
917 	if (ret != 1)
918 		return ret;
919 
920 	if (copy_from_user(&mig, arg, minsz))
921 		return -EFAULT;
922 
923 	if (flags & VFIO_DEVICE_FEATURE_GET) {
924 		enum vfio_device_mig_state curr_state;
925 
926 		ret = device->mig_ops->migration_get_state(device,
927 							   &curr_state);
928 		if (ret)
929 			return ret;
930 		mig.device_state = curr_state;
931 		goto out_copy;
932 	}
933 
934 	/* Handle the VFIO_DEVICE_FEATURE_SET */
935 	filp = device->mig_ops->migration_set_state(device, mig.device_state);
936 	if (IS_ERR(filp) || !filp)
937 		goto out_copy;
938 
939 	return vfio_ioct_mig_return_fd(filp, arg, &mig);
940 out_copy:
941 	mig.data_fd = -1;
942 	if (copy_to_user(arg, &mig, sizeof(mig)))
943 		return -EFAULT;
944 	if (IS_ERR(filp))
945 		return PTR_ERR(filp);
946 	return 0;
947 }
948 
949 static int
950 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
951 					      u32 flags, void __user *arg,
952 					      size_t argsz)
953 {
954 	struct vfio_device_feature_mig_data_size data_size = {};
955 	unsigned long stop_copy_length;
956 	int ret;
957 
958 	if (!device->mig_ops)
959 		return -ENOTTY;
960 
961 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
962 				 sizeof(data_size));
963 	if (ret != 1)
964 		return ret;
965 
966 	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
967 	if (ret)
968 		return ret;
969 
970 	data_size.stop_copy_length = stop_copy_length;
971 	if (copy_to_user(arg, &data_size, sizeof(data_size)))
972 		return -EFAULT;
973 
974 	return 0;
975 }
976 
977 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
978 					       u32 flags, void __user *arg,
979 					       size_t argsz)
980 {
981 	struct vfio_device_feature_migration mig = {
982 		.flags = device->migration_flags,
983 	};
984 	int ret;
985 
986 	if (!device->mig_ops)
987 		return -ENOTTY;
988 
989 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
990 				 sizeof(mig));
991 	if (ret != 1)
992 		return ret;
993 	if (copy_to_user(arg, &mig, sizeof(mig)))
994 		return -EFAULT;
995 	return 0;
996 }
997 
998 void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
999 			      u32 req_nodes)
1000 {
1001 	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
1002 	unsigned long min_gap, curr_gap;
1003 
1004 	/* Special shortcut when a single range is required */
1005 	if (req_nodes == 1) {
1006 		unsigned long last;
1007 
1008 		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
1009 
1010 		/* Empty list */
1011 		if (WARN_ON_ONCE(!comb_start))
1012 			return;
1013 
1014 		curr = comb_start;
1015 		while (curr) {
1016 			last = curr->last;
1017 			prev = curr;
1018 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1019 			if (prev != comb_start)
1020 				interval_tree_remove(prev, root);
1021 		}
1022 		comb_start->last = last;
1023 		return;
1024 	}
1025 
1026 	/* Combine ranges which have the smallest gap */
1027 	while (cur_nodes > req_nodes) {
1028 		prev = NULL;
1029 		min_gap = ULONG_MAX;
1030 		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
1031 		while (curr) {
1032 			if (prev) {
1033 				curr_gap = curr->start - prev->last;
1034 				if (curr_gap < min_gap) {
1035 					min_gap = curr_gap;
1036 					comb_start = prev;
1037 					comb_end = curr;
1038 				}
1039 			}
1040 			prev = curr;
1041 			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
1042 		}
1043 
1044 		/* Empty list or no nodes to combine */
1045 		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
1046 			break;
1047 
1048 		comb_start->last = comb_end->last;
1049 		interval_tree_remove(comb_end, root);
1050 		cur_nodes--;
1051 	}
1052 }
1053 EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
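
/*
 * Worked example: given the ranges [0x0000, 0x0fff], [0x2000, 0x2fff] and
 * [0x3000, 0x3fff] with req_nodes == 2, the gap between the last two ranges
 * (0x3000 - 0x2fff = 1) is the smallest, so they are merged into
 * [0x2000, 0x3fff] while [0x0000, 0x0fff] is left untouched.
 */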
1054 
1055 /* Ranges should fit into a single kernel page */
1056 #define LOG_MAX_RANGES \
1057 	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1058 
1059 static int
1060 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
1061 					u32 flags, void __user *arg,
1062 					size_t argsz)
1063 {
1064 	size_t minsz =
1065 		offsetofend(struct vfio_device_feature_dma_logging_control,
1066 			    ranges);
1067 	struct vfio_device_feature_dma_logging_range __user *ranges;
1068 	struct vfio_device_feature_dma_logging_control control;
1069 	struct vfio_device_feature_dma_logging_range range;
1070 	struct rb_root_cached root = RB_ROOT_CACHED;
1071 	struct interval_tree_node *nodes;
1072 	u64 iova_end;
1073 	u32 nnodes;
1074 	int i, ret;
1075 
1076 	if (!device->log_ops)
1077 		return -ENOTTY;
1078 
1079 	ret = vfio_check_feature(flags, argsz,
1080 				 VFIO_DEVICE_FEATURE_SET,
1081 				 sizeof(control));
1082 	if (ret != 1)
1083 		return ret;
1084 
1085 	if (copy_from_user(&control, arg, minsz))
1086 		return -EFAULT;
1087 
1088 	nnodes = control.num_ranges;
1089 	if (!nnodes)
1090 		return -EINVAL;
1091 
1092 	if (nnodes > LOG_MAX_RANGES)
1093 		return -E2BIG;
1094 
1095 	ranges = u64_to_user_ptr(control.ranges);
1096 	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
1097 			      GFP_KERNEL);
1098 	if (!nodes)
1099 		return -ENOMEM;
1100 
1101 	for (i = 0; i < nnodes; i++) {
1102 		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
1103 			ret = -EFAULT;
1104 			goto end;
1105 		}
1106 		if (!IS_ALIGNED(range.iova, control.page_size) ||
1107 		    !IS_ALIGNED(range.length, control.page_size)) {
1108 			ret = -EINVAL;
1109 			goto end;
1110 		}
1111 
1112 		if (check_add_overflow(range.iova, range.length, &iova_end) ||
1113 		    iova_end > ULONG_MAX) {
1114 			ret = -EOVERFLOW;
1115 			goto end;
1116 		}
1117 
1118 		nodes[i].start = range.iova;
1119 		nodes[i].last = range.iova + range.length - 1;
1120 		if (interval_tree_iter_first(&root, nodes[i].start,
1121 					     nodes[i].last)) {
1122 			/* Range overlapping */
1123 			ret = -EINVAL;
1124 			goto end;
1125 		}
1126 		interval_tree_insert(nodes + i, &root);
1127 	}
1128 
1129 	ret = device->log_ops->log_start(device, &root, nnodes,
1130 					 &control.page_size);
1131 	if (ret)
1132 		goto end;
1133 
1134 	if (copy_to_user(arg, &control, sizeof(control))) {
1135 		ret = -EFAULT;
1136 		device->log_ops->log_stop(device);
1137 	}
1138 
1139 end:
1140 	kfree(nodes);
1141 	return ret;
1142 }
1143 
1144 static int
1145 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1146 				       u32 flags, void __user *arg,
1147 				       size_t argsz)
1148 {
1149 	int ret;
1150 
1151 	if (!device->log_ops)
1152 		return -ENOTTY;
1153 
1154 	ret = vfio_check_feature(flags, argsz,
1155 				 VFIO_DEVICE_FEATURE_SET, 0);
1156 	if (ret != 1)
1157 		return ret;
1158 
1159 	return device->log_ops->log_stop(device);
1160 }
1161 
1162 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
1163 					  unsigned long iova, size_t length,
1164 					  void *opaque)
1165 {
1166 	struct vfio_device *device = opaque;
1167 
1168 	return device->log_ops->log_read_and_clear(device, iova, length, iter);
1169 }
1170 
1171 static int
1172 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
1173 					 u32 flags, void __user *arg,
1174 					 size_t argsz)
1175 {
1176 	size_t minsz =
1177 		offsetofend(struct vfio_device_feature_dma_logging_report,
1178 			    bitmap);
1179 	struct vfio_device_feature_dma_logging_report report;
1180 	struct iova_bitmap *iter;
1181 	u64 iova_end;
1182 	int ret;
1183 
1184 	if (!device->log_ops)
1185 		return -ENOTTY;
1186 
1187 	ret = vfio_check_feature(flags, argsz,
1188 				 VFIO_DEVICE_FEATURE_GET,
1189 				 sizeof(report));
1190 	if (ret != 1)
1191 		return ret;
1192 
1193 	if (copy_from_user(&report, arg, minsz))
1194 		return -EFAULT;
1195 
1196 	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1197 		return -EINVAL;
1198 
1199 	if (check_add_overflow(report.iova, report.length, &iova_end) ||
1200 	    iova_end > ULONG_MAX)
1201 		return -EOVERFLOW;
1202 
1203 	iter = iova_bitmap_alloc(report.iova, report.length,
1204 				 report.page_size,
1205 				 u64_to_user_ptr(report.bitmap));
1206 	if (IS_ERR(iter))
1207 		return PTR_ERR(iter);
1208 
1209 	ret = iova_bitmap_for_each(iter, device,
1210 				   vfio_device_log_read_and_clear);
1211 
1212 	iova_bitmap_free(iter);
1213 	return ret;
1214 }
1215 
1216 static int vfio_ioctl_device_feature(struct vfio_device *device,
1217 				     struct vfio_device_feature __user *arg)
1218 {
1219 	size_t minsz = offsetofend(struct vfio_device_feature, flags);
1220 	struct vfio_device_feature feature;
1221 
1222 	if (copy_from_user(&feature, arg, minsz))
1223 		return -EFAULT;
1224 
1225 	if (feature.argsz < minsz)
1226 		return -EINVAL;
1227 
1228 	/* Check unknown flags */
1229 	if (feature.flags &
1230 	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1231 	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1232 		return -EINVAL;
1233 
1234 	/* GET & SET are mutually exclusive except with PROBE */
1235 	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1236 	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1237 	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
1238 		return -EINVAL;
1239 
1240 	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1241 	case VFIO_DEVICE_FEATURE_MIGRATION:
1242 		return vfio_ioctl_device_feature_migration(
1243 			device, feature.flags, arg->data,
1244 			feature.argsz - minsz);
1245 	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1246 		return vfio_ioctl_device_feature_mig_device_state(
1247 			device, feature.flags, arg->data,
1248 			feature.argsz - minsz);
1249 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1250 		return vfio_ioctl_device_feature_logging_start(
1251 			device, feature.flags, arg->data,
1252 			feature.argsz - minsz);
1253 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1254 		return vfio_ioctl_device_feature_logging_stop(
1255 			device, feature.flags, arg->data,
1256 			feature.argsz - minsz);
1257 	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1258 		return vfio_ioctl_device_feature_logging_report(
1259 			device, feature.flags, arg->data,
1260 			feature.argsz - minsz);
1261 	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1262 		return vfio_ioctl_device_feature_migration_data_size(
1263 			device, feature.flags, arg->data,
1264 			feature.argsz - minsz);
1265 	default:
1266 		if (unlikely(!device->ops->device_feature))
1267 			return -EINVAL;
1268 		return device->ops->device_feature(device, feature.flags,
1269 						   arg->data,
1270 						   feature.argsz - minsz);
1271 	}
1272 }
1273 
1274 static long vfio_device_fops_unl_ioctl(struct file *filep,
1275 				       unsigned int cmd, unsigned long arg)
1276 {
1277 	struct vfio_device_file *df = filep->private_data;
1278 	struct vfio_device *device = df->device;
1279 	void __user *uptr = (void __user *)arg;
1280 	int ret;
1281 
1282 	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
1283 		return vfio_df_ioctl_bind_iommufd(df, uptr);
1284 
1285 	/* Paired with smp_store_release() following vfio_df_open() */
1286 	if (!smp_load_acquire(&df->access_granted))
1287 		return -EINVAL;
1288 
1289 	ret = vfio_device_pm_runtime_get(device);
1290 	if (ret)
1291 		return ret;
1292 
1293 	/* cdev only ioctls */
1294 	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
1295 		switch (cmd) {
1296 		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
1297 			ret = vfio_df_ioctl_attach_pt(df, uptr);
1298 			goto out;
1299 
1300 		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
1301 			ret = vfio_df_ioctl_detach_pt(df, uptr);
1302 			goto out;
1303 		}
1304 	}
1305 
1306 	switch (cmd) {
1307 	case VFIO_DEVICE_FEATURE:
1308 		ret = vfio_ioctl_device_feature(device, uptr);
1309 		break;
1310 
1311 	default:
1312 		if (unlikely(!device->ops->ioctl))
1313 			ret = -EINVAL;
1314 		else
1315 			ret = device->ops->ioctl(device, cmd, arg);
1316 		break;
1317 	}
1318 out:
1319 	vfio_device_pm_runtime_put(device);
1320 	return ret;
1321 }
1322 
1323 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1324 				     size_t count, loff_t *ppos)
1325 {
1326 	struct vfio_device_file *df = filep->private_data;
1327 	struct vfio_device *device = df->device;
1328 
1329 	/* Paired with smp_store_release() following vfio_df_open() */
1330 	if (!smp_load_acquire(&df->access_granted))
1331 		return -EINVAL;
1332 
1333 	if (unlikely(!device->ops->read))
1334 		return -EINVAL;
1335 
1336 	return device->ops->read(device, buf, count, ppos);
1337 }
1338 
1339 static ssize_t vfio_device_fops_write(struct file *filep,
1340 				      const char __user *buf,
1341 				      size_t count, loff_t *ppos)
1342 {
1343 	struct vfio_device_file *df = filep->private_data;
1344 	struct vfio_device *device = df->device;
1345 
1346 	/* Paired with smp_store_release() following vfio_df_open() */
1347 	if (!smp_load_acquire(&df->access_granted))
1348 		return -EINVAL;
1349 
1350 	if (unlikely(!device->ops->write))
1351 		return -EINVAL;
1352 
1353 	return device->ops->write(device, buf, count, ppos);
1354 }
1355 
1356 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1357 {
1358 	struct vfio_device_file *df = filep->private_data;
1359 	struct vfio_device *device = df->device;
1360 
1361 	/* Paired with smp_store_release() following vfio_df_open() */
1362 	if (!smp_load_acquire(&df->access_granted))
1363 		return -EINVAL;
1364 
1365 	if (unlikely(!device->ops->mmap))
1366 		return -EINVAL;
1367 
1368 	return device->ops->mmap(device, vma);
1369 }
1370 
1371 const struct file_operations vfio_device_fops = {
1372 	.owner		= THIS_MODULE,
1373 	.open		= vfio_device_fops_cdev_open,
1374 	.release	= vfio_device_fops_release,
1375 	.read		= vfio_device_fops_read,
1376 	.write		= vfio_device_fops_write,
1377 	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
1378 	.compat_ioctl	= compat_ptr_ioctl,
1379 	.mmap		= vfio_device_fops_mmap,
1380 };
1381 
1382 static struct vfio_device *vfio_device_from_file(struct file *file)
1383 {
1384 	struct vfio_device_file *df = file->private_data;
1385 
1386 	if (file->f_op != &vfio_device_fops)
1387 		return NULL;
1388 	return df->device;
1389 }
1390 
1391 /**
1392  * vfio_file_is_valid - True if the file is valid vfio file
1393  * @file: VFIO group file or VFIO device file
1394  */
1395 bool vfio_file_is_valid(struct file *file)
1396 {
1397 	return vfio_group_from_file(file) ||
1398 	       vfio_device_from_file(file);
1399 }
1400 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1401 
1402 /**
1403  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1404  *        is always CPU cache coherent
1405  * @file: VFIO group file or VFIO device file
1406  *
1407  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1408  * bit in DMA transactions. A return of false indicates that the user has
1409  * rights to access additional instructions such as wbinvd on x86.
1410  */
1411 bool vfio_file_enforced_coherent(struct file *file)
1412 {
1413 	struct vfio_device *device;
1414 	struct vfio_group *group;
1415 
1416 	group = vfio_group_from_file(file);
1417 	if (group)
1418 		return vfio_group_enforced_coherent(group);
1419 
1420 	device = vfio_device_from_file(file);
1421 	if (device)
1422 		return device_iommu_capable(device->dev,
1423 					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1424 
1425 	return true;
1426 }
1427 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1428 
1429 static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
1430 {
1431 	struct vfio_device_file *df = file->private_data;
1432 
1433 	/*
1434 	 * The kvm is first recorded in the vfio_device_file, and will
1435 	 * be propagated to vfio_device::kvm when the file is bound to
1436 	 * iommufd successfully in the vfio device cdev path.
1437 	 */
1438 	spin_lock(&df->kvm_ref_lock);
1439 	df->kvm = kvm;
1440 	spin_unlock(&df->kvm_ref_lock);
1441 }
1442 
1443 /**
1444  * vfio_file_set_kvm - Link a kvm with VFIO drivers
1445  * @file: VFIO group file or VFIO device file
1446  * @kvm: KVM to link
1447  *
1448  * When a VFIO device is first opened the KVM will be available in
1449  * device->kvm if one was associated with the file.
1450  */
1451 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
1452 {
1453 	struct vfio_group *group;
1454 
1455 	group = vfio_group_from_file(file);
1456 	if (group)
1457 		vfio_group_set_kvm(group, kvm);
1458 
1459 	if (vfio_device_from_file(file))
1460 		vfio_device_file_set_kvm(file, kvm);
1461 }
1462 EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
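
/*
 * The expected caller is the KVM side (e.g. the kvm-vfio pseudo device),
 * which roughly does the following with a VFIO fd handed to it by userspace
 * (sketch, not a verbatim copy of that code):
 *
 *	if (!vfio_file_is_valid(file))
 *		return -EINVAL;
 *	vfio_file_set_kvm(file, kvm);
 */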
1463 
1464 /*
1465  * Sub-module support
1466  */
1467 /*
1468  * Helper for managing a buffer of info chain capabilities, allocate or
1469  * reallocate a buffer with additional @size, filling in @id and @version
1470  * of the capability.  A pointer to the new capability is returned.
1471  *
1472  * NB. The chain is based at the head of the buffer, so new entries are
1473  * added to the tail; vfio_info_cap_shift() should be called to fix up the
1474  * next offsets prior to copying to the user buffer.
1475  */
1476 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1477 					       size_t size, u16 id, u16 version)
1478 {
1479 	void *buf;
1480 	struct vfio_info_cap_header *header, *tmp;
1481 
1482 	/* Ensure that the next capability struct will be aligned */
1483 	size = ALIGN(size, sizeof(u64));
1484 
1485 	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1486 	if (!buf) {
1487 		kfree(caps->buf);
1488 		caps->buf = NULL;
1489 		caps->size = 0;
1490 		return ERR_PTR(-ENOMEM);
1491 	}
1492 
1493 	caps->buf = buf;
1494 	header = buf + caps->size;
1495 
1496 	/* Eventually copied to user buffer, zero */
1497 	memset(header, 0, size);
1498 
1499 	header->id = id;
1500 	header->version = version;
1501 
1502 	/* Add to the end of the capability chain */
1503 	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1504 		; /* nothing */
1505 
1506 	tmp->next = caps->size;
1507 	caps->size += size;
1508 
1509 	return header;
1510 }
1511 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1512 
1513 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1514 {
1515 	struct vfio_info_cap_header *tmp;
1516 	void *buf = (void *)caps->buf;
1517 
1518 	/* Capability structs should start with proper alignment */
1519 	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
1520 
1521 	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1522 		tmp->next += offset;
1523 }
1524 EXPORT_SYMBOL(vfio_info_cap_shift);
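
/*
 * Illustrative end-to-end use of the caps helpers in an ioctl handler
 * (sketch of the common pattern; "info", "my_cap" and "arg" are hypothetical
 * names for the fixed-size ioctl struct, a driver capability and the user
 * pointer):
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &my_cap.header, sizeof(my_cap));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user(arg + sizeof(info), caps.buf,
 *					 caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */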
1525 
1526 int vfio_info_add_capability(struct vfio_info_cap *caps,
1527 			     struct vfio_info_cap_header *cap, size_t size)
1528 {
1529 	struct vfio_info_cap_header *header;
1530 
1531 	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1532 	if (IS_ERR(header))
1533 		return PTR_ERR(header);
1534 
1535 	memcpy(header + 1, cap + 1, size - sizeof(*header));
1536 
1537 	return 0;
1538 }
1539 EXPORT_SYMBOL(vfio_info_add_capability);
1540 
1541 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1542 				       int max_irq_type, size_t *data_size)
1543 {
1544 	unsigned long minsz;
1545 	size_t size;
1546 
1547 	minsz = offsetofend(struct vfio_irq_set, count);
1548 
1549 	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1550 	    (hdr->count >= (U32_MAX - hdr->start)) ||
1551 	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1552 				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1553 		return -EINVAL;
1554 
1555 	if (data_size)
1556 		*data_size = 0;
1557 
1558 	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1559 		return -EINVAL;
1560 
1561 	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1562 	case VFIO_IRQ_SET_DATA_NONE:
1563 		size = 0;
1564 		break;
1565 	case VFIO_IRQ_SET_DATA_BOOL:
1566 		size = sizeof(uint8_t);
1567 		break;
1568 	case VFIO_IRQ_SET_DATA_EVENTFD:
1569 		size = sizeof(int32_t);
1570 		break;
1571 	default:
1572 		return -EINVAL;
1573 	}
1574 
1575 	if (size) {
1576 		if (hdr->argsz - minsz < hdr->count * size)
1577 			return -EINVAL;
1578 
1579 		if (!data_size)
1580 			return -EINVAL;
1581 
1582 		*data_size = hdr->count * size;
1583 	}
1584 
1585 	return 0;
1586 }
1587 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
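
/*
 * Illustrative use in a driver's VFIO_DEVICE_SET_IRQS handler (sketch; the
 * surrounding ioctl plumbing and my_irq_count, the number of IRQs for
 * hdr.index, are hypothetical):
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_irq_count,
 *						 VFIO_PCI_NUM_IRQS, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */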
1588 
1589 /*
1590  * Pin contiguous user pages and return their associated host pages for local
1591  * domain only.
1592  * @device [in]  : device
1593  * @iova [in]    : starting IOVA of user pages to be pinned.
1594  * @npage [in]   : count of pages to be pinned.  This count should not
1595  *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1596  * @prot [in]    : protection flags
1597  * @pages[out]   : array of host pages
1598  * Return error or number of pages pinned.
1599  *
1600  * A driver may only call this function if the vfio_device was created
1601  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1602  */
1603 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1604 		   int npage, int prot, struct page **pages)
1605 {
1606 	/* group->container cannot change while a vfio device is open */
1607 	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1608 		return -EINVAL;
1609 	if (!device->ops->dma_unmap)
1610 		return -EINVAL;
1611 	if (vfio_device_has_container(device))
1612 		return vfio_device_container_pin_pages(device, iova,
1613 						       npage, prot, pages);
1614 	if (device->iommufd_access) {
1615 		int ret;
1616 
1617 		if (iova > ULONG_MAX)
1618 			return -EINVAL;
1619 		/*
1620 		 * VFIO ignores the sub page offset, npages is from the start of
1621 		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1622 		 * the sub page offset by doing:
1623 		 *     pages[0] + (iova % PAGE_SIZE)
1624 		 */
1625 		ret = iommufd_access_pin_pages(
1626 			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1627 			npage * PAGE_SIZE, pages,
1628 			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1629 		if (ret)
1630 			return ret;
1631 		return npage;
1632 	}
1633 	return -EINVAL;
1634 }
1635 EXPORT_SYMBOL(vfio_pin_pages);
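
/*
 * Illustrative use by an emulated-IOMMU (mdev-style) driver to access one
 * page of guest memory (sketch; names hypothetical, error handling trimmed):
 *
 *	struct page *page;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	va = kmap_local_page(page);
 *	memcpy(va + offset_in_page(iova), buf, len);	// len fits in the page
 *	kunmap_local(va);
 *
 *	vfio_unpin_pages(vdev, iova, 1);
 */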
1636 
1637 /*
1638  * Unpin contiguous host pages for local domain only.
1639  * @device [in]  : device
1640  * @iova [in]    : starting address of user pages to be unpinned.
1641  * @npage [in]   : count of pages to be unpinned.  This count should not
1642  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1643  */
1644 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1645 {
1646 	if (WARN_ON(!vfio_assert_device_open(device)))
1647 		return;
1648 	if (WARN_ON(!device->ops->dma_unmap))
1649 		return;
1650 
1651 	if (vfio_device_has_container(device)) {
1652 		vfio_device_container_unpin_pages(device, iova, npage);
1653 		return;
1654 	}
1655 	if (device->iommufd_access) {
1656 		if (WARN_ON(iova > ULONG_MAX))
1657 			return;
1658 		iommufd_access_unpin_pages(device->iommufd_access,
1659 					   ALIGN_DOWN(iova, PAGE_SIZE),
1660 					   npage * PAGE_SIZE);
1661 		return;
1662 	}
1663 }
1664 EXPORT_SYMBOL(vfio_unpin_pages);
1665 
1666 /*
1667  * This interface allows the CPUs to perform some sort of virtual DMA on
1668  * behalf of the device.
1669  *
1670  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1671  * into/from a kernel buffer.
1672  *
1673  * As the read/write of user space memory is conducted via the CPUs and is
1674  * not a real device DMA, it is not necessary to pin the user space memory.
1675  *
1676  * @device [in]		: VFIO device
1677  * @iova [in]		: base IOVA of a user space buffer
1678  * @data [in]		: pointer to kernel buffer
1679  * @len [in]		: kernel buffer length
1680  * @write		: indicate read or write
1681  * Return error code on failure or 0 on success.
1682  */
1683 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1684 		size_t len, bool write)
1685 {
1686 	if (!data || len <= 0 || !vfio_assert_device_open(device))
1687 		return -EINVAL;
1688 
1689 	if (vfio_device_has_container(device))
1690 		return vfio_device_container_dma_rw(device, iova,
1691 						    data, len, write);
1692 
1693 	if (device->iommufd_access) {
1694 		unsigned int flags = 0;
1695 
1696 		if (iova > ULONG_MAX)
1697 			return -EINVAL;
1698 
1699 		/* VFIO historically tries to auto-detect a kthread */
1700 		if (!current->mm)
1701 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1702 		if (write)
1703 			flags |= IOMMUFD_ACCESS_RW_WRITE;
1704 		return iommufd_access_rw(device->iommufd_access, iova, data,
1705 					 len, flags);
1706 	}
1707 	return -EINVAL;
1708 }
1709 EXPORT_SYMBOL(vfio_dma_rw);
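
/*
 * Illustrative use (sketch): read a 4-byte guest descriptor at @iova into a
 * kernel variable without pinning the backing memory:
 *
 *	__le32 desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 */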
1710 
1711 /*
1712  * Module/class support
1713  */
1714 static int __init vfio_init(void)
1715 {
1716 	int ret;
1717 
1718 	ida_init(&vfio.device_ida);
1719 
1720 	ret = vfio_group_init();
1721 	if (ret)
1722 		return ret;
1723 
1724 	ret = vfio_virqfd_init();
1725 	if (ret)
1726 		goto err_virqfd;
1727 
1728 	/* /sys/class/vfio-dev/vfioX */
1729 	vfio.device_class = class_create("vfio-dev");
1730 	if (IS_ERR(vfio.device_class)) {
1731 		ret = PTR_ERR(vfio.device_class);
1732 		goto err_dev_class;
1733 	}
1734 
1735 	ret = vfio_cdev_init(vfio.device_class);
1736 	if (ret)
1737 		goto err_alloc_dev_chrdev;
1738 
1739 	vfio_debugfs_create_root();
1740 	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1741 	return 0;
1742 
1743 err_alloc_dev_chrdev:
1744 	class_destroy(vfio.device_class);
1745 	vfio.device_class = NULL;
1746 err_dev_class:
1747 	vfio_virqfd_exit();
1748 err_virqfd:
1749 	vfio_group_cleanup();
1750 	return ret;
1751 }
1752 
1753 static void __exit vfio_cleanup(void)
1754 {
1755 	vfio_debugfs_remove_root();
1756 	ida_destroy(&vfio.device_ida);
1757 	vfio_cdev_cleanup();
1758 	class_destroy(vfio.device_class);
1759 	vfio.device_class = NULL;
1760 	vfio_virqfd_exit();
1761 	vfio_group_cleanup();
1762 	xa_destroy(&vfio_device_set_xa);
1763 }
1764 
1765 module_init(vfio_init);
1766 module_exit(vfio_cleanup);
1767 
1768 MODULE_IMPORT_NS(IOMMUFD);
1769 MODULE_VERSION(DRIVER_VERSION);
1770 MODULE_LICENSE("GPL v2");
1771 MODULE_AUTHOR(DRIVER_AUTHOR);
1772 MODULE_DESCRIPTION(DRIVER_DESC);
1773 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1774