1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  *
12  * We arbitrarily define a Type1 IOMMU as one matching the below code.
13  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
14  * VT-d, but that makes it harder to re-use as theoretically anyone
15  * implementing a similar IOMMU could make use of this.  We expect the
16  * IOMMU to support the IOMMU API and have few to no restrictions around
17  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
18  * optimized for relatively static mappings of a userspace process with
19  * userspace pages pinned into memory.  We also assume devices and IOMMU
20  * domains are PCI based as the IOMMU API is still centered around a
21  * device/bus interface rather than a group interface.
22  */
23 
24 #include <linux/compat.h>
25 #include <linux/device.h>
26 #include <linux/fs.h>
27 #include <linux/highmem.h>
28 #include <linux/iommu.h>
29 #include <linux/module.h>
30 #include <linux/mm.h>
31 #include <linux/kthread.h>
32 #include <linux/rbtree.h>
33 #include <linux/sched/signal.h>
34 #include <linux/sched/mm.h>
35 #include <linux/slab.h>
36 #include <linux/uaccess.h>
37 #include <linux/vfio.h>
38 #include <linux/workqueue.h>
39 #include <linux/mdev.h>
40 #include <linux/notifier.h>
41 #include <linux/dma-iommu.h>
42 #include <linux/irqdomain.h>
43 
44 #define DRIVER_VERSION  "0.2"
45 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
46 #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
47 
48 static bool allow_unsafe_interrupts;
49 module_param_named(allow_unsafe_interrupts,
50 		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
51 MODULE_PARM_DESC(allow_unsafe_interrupts,
52 		 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
53 
54 static bool disable_hugepages;
55 module_param_named(disable_hugepages,
56 		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
57 MODULE_PARM_DESC(disable_hugepages,
58 		 "Disable VFIO IOMMU support for IOMMU hugepages.");
59 
60 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
61 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
62 MODULE_PARM_DESC(dma_entry_limit,
63 		 "Maximum number of user DMA mappings per container (65535).");
64 
65 struct vfio_iommu {
66 	struct list_head	domain_list;
67 	struct list_head	iova_list;
68 	struct vfio_domain	*external_domain; /* domain for external user */
69 	struct mutex		lock;
70 	struct rb_root		dma_list;
71 	struct blocking_notifier_head notifier;
72 	unsigned int		dma_avail;
73 	unsigned int		vaddr_invalid_count;
74 	uint64_t		pgsize_bitmap;
75 	uint64_t		num_non_pinned_groups;
76 	wait_queue_head_t	vaddr_wait;
77 	bool			v2;
78 	bool			nesting;
79 	bool			dirty_page_tracking;
80 	bool			container_open;
81 };
82 
83 struct vfio_domain {
84 	struct iommu_domain	*domain;
85 	struct list_head	next;
86 	struct list_head	group_list;
87 	int			prot;		/* IOMMU_CACHE */
88 	bool			fgsp;		/* Fine-grained super pages */
89 };
90 
91 struct vfio_dma {
92 	struct rb_node		node;
93 	dma_addr_t		iova;		/* Device address */
94 	unsigned long		vaddr;		/* Process virtual addr */
95 	size_t			size;		/* Map size (bytes) */
96 	int			prot;		/* IOMMU_READ/WRITE */
97 	bool			iommu_mapped;
98 	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
99 	bool			vaddr_invalid;
100 	struct task_struct	*task;
101 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
102 	unsigned long		*bitmap;
103 	struct mm_struct	*mm;
104 	size_t			locked_vm;
105 };
106 
107 struct vfio_batch {
108 	struct page		**pages;	/* for pin_user_pages_remote */
109 	struct page		*fallback_page; /* if pages alloc fails */
110 	int			capacity;	/* length of pages array */
111 	int			size;		/* of batch currently */
112 	int			offset;		/* of next entry in pages */
113 };
114 
115 struct vfio_iommu_group {
116 	struct iommu_group	*iommu_group;
117 	struct list_head	next;
118 	bool			mdev_group;	/* An mdev group */
119 	bool			pinned_page_dirty_scope;
120 };
121 
122 struct vfio_iova {
123 	struct list_head	list;
124 	dma_addr_t		start;
125 	dma_addr_t		end;
126 };
127 
128 /*
129  * Guest RAM pinning working set or DMA target
130  */
131 struct vfio_pfn {
132 	struct rb_node		node;
133 	dma_addr_t		iova;		/* Device address */
134 	unsigned long		pfn;		/* Host pfn */
135 	unsigned int		ref_count;
136 };
137 
138 struct vfio_regions {
139 	struct list_head list;
140 	dma_addr_t iova;
141 	phys_addr_t phys;
142 	size_t len;
143 };
144 
145 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
146 					(!list_empty(&iommu->domain_list))
147 
148 #define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
149 
150 /*
151  * Input argument of number of bits to bitmap_set() is unsigned integer, which
152  * further casts to signed integer for unaligned multi-bit operation,
153  * __bitmap_set().
154  * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
155  * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
156  * system.
157  */
158 #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
159 #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
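/*
 * Worked example (illustrative, assuming a 4K PAGE_SIZE system): tracking
 * the full DIRTY_BITMAP_PAGES_MAX = 2^31 - 1 pages needs roughly
 * DIRTY_BITMAP_BYTES(2^31) = 2^31 / 8 = 2^28 bytes (256 MB) of bitmap,
 * and those 2^31 pages of 2^12 bytes each cover 2^43 bytes (8 TB) of IOVA.
 */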
160 
161 #define WAITED 1
162 
163 static int put_pfn(unsigned long pfn, int prot);
164 
165 static struct vfio_iommu_group*
166 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
167 			    struct iommu_group *iommu_group);
168 
169 /*
170  * This code handles mapping and unmapping of user data buffers
171  * into DMA'ble space using the IOMMU
172  */
173 
174 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
175 				      dma_addr_t start, size_t size)
176 {
177 	struct rb_node *node = iommu->dma_list.rb_node;
178 
179 	while (node) {
180 		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
181 
182 		if (start + size <= dma->iova)
183 			node = node->rb_left;
184 		else if (start >= dma->iova + dma->size)
185 			node = node->rb_right;
186 		else
187 			return dma;
188 	}
189 
190 	return NULL;
191 }
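/*
 * Illustrative sketch of the overlap test above, using made-up addresses: a
 * tracked vfio_dma at iova 0x100000 with size 0x200000 matches any lookup
 * where start < 0x300000 and start + size > 0x100000.  For example,
 * vfio_find_dma(iommu, 0x2ff000, 0x1000) returns that dma, while
 * vfio_find_dma(iommu, 0x300000, 0x1000) walks right in the tree and misses.
 */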
192 
193 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
194 						dma_addr_t start, u64 size)
195 {
196 	struct rb_node *res = NULL;
197 	struct rb_node *node = iommu->dma_list.rb_node;
198 	struct vfio_dma *dma_res = NULL;
199 
200 	while (node) {
201 		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
202 
203 		if (start < dma->iova + dma->size) {
204 			res = node;
205 			dma_res = dma;
206 			if (start >= dma->iova)
207 				break;
208 			node = node->rb_left;
209 		} else {
210 			node = node->rb_right;
211 		}
212 	}
213 	if (res && size && dma_res->iova >= start + size)
214 		res = NULL;
215 	return res;
216 }
217 
218 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
219 {
220 	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
221 	struct vfio_dma *dma;
222 
223 	while (*link) {
224 		parent = *link;
225 		dma = rb_entry(parent, struct vfio_dma, node);
226 
227 		if (new->iova + new->size <= dma->iova)
228 			link = &(*link)->rb_left;
229 		else
230 			link = &(*link)->rb_right;
231 	}
232 
233 	rb_link_node(&new->node, parent, link);
234 	rb_insert_color(&new->node, &iommu->dma_list);
235 }
236 
237 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
238 {
239 	rb_erase(&old->node, &iommu->dma_list);
240 }
241 
242 
243 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
244 {
245 	uint64_t npages = dma->size / pgsize;
246 
247 	if (npages > DIRTY_BITMAP_PAGES_MAX)
248 		return -EINVAL;
249 
250 	/*
251 	 * Allocate extra 64 bits that are used to calculate shift required for
252 	 * bitmap_shift_left() to manipulate and club unaligned number of pages
253 	 * in adjacent vfio_dma ranges.
254 	 */
255 	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
256 			       GFP_KERNEL);
257 	if (!dma->bitmap)
258 		return -ENOMEM;
259 
260 	return 0;
261 }
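/*
 * Example sizing, assuming a 4K minimum IOMMU page size: a 1 GB vfio_dma has
 * npages = 262144, so the allocation above is DIRTY_BITMAP_BYTES(262144) =
 * 32 KB plus one extra u64 of slack for the bitmap_shift_left() handling of
 * ranges that do not start on a 64-page-aligned boundary.
 */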
262 
263 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
264 {
265 	kfree(dma->bitmap);
266 	dma->bitmap = NULL;
267 }
268 
269 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
270 {
271 	struct rb_node *p;
272 	unsigned long pgshift = __ffs(pgsize);
273 
274 	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
275 		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
276 
277 		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
278 	}
279 }
280 
281 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
282 {
283 	struct rb_node *n;
284 	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
285 
286 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
287 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
288 
289 		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
290 	}
291 }
292 
293 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
294 {
295 	struct rb_node *n;
296 
297 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
298 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
299 		int ret;
300 
301 		ret = vfio_dma_bitmap_alloc(dma, pgsize);
302 		if (ret) {
303 			struct rb_node *p;
304 
305 			for (p = rb_prev(n); p; p = rb_prev(p)) {
306 				struct vfio_dma *dma = rb_entry(p,
307 							struct vfio_dma, node);
308 
309 				vfio_dma_bitmap_free(dma);
310 			}
311 			return ret;
312 		}
313 		vfio_dma_populate_bitmap(dma, pgsize);
314 	}
315 	return 0;
316 }
317 
318 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
319 {
320 	struct rb_node *n;
321 
322 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
323 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
324 
325 		vfio_dma_bitmap_free(dma);
326 	}
327 }
328 
329 /*
330  * Helper Functions for host iova-pfn list
331  */
332 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
333 {
334 	struct vfio_pfn *vpfn;
335 	struct rb_node *node = dma->pfn_list.rb_node;
336 
337 	while (node) {
338 		vpfn = rb_entry(node, struct vfio_pfn, node);
339 
340 		if (iova < vpfn->iova)
341 			node = node->rb_left;
342 		else if (iova > vpfn->iova)
343 			node = node->rb_right;
344 		else
345 			return vpfn;
346 	}
347 	return NULL;
348 }
349 
350 static void vfio_link_pfn(struct vfio_dma *dma,
351 			  struct vfio_pfn *new)
352 {
353 	struct rb_node **link, *parent = NULL;
354 	struct vfio_pfn *vpfn;
355 
356 	link = &dma->pfn_list.rb_node;
357 	while (*link) {
358 		parent = *link;
359 		vpfn = rb_entry(parent, struct vfio_pfn, node);
360 
361 		if (new->iova < vpfn->iova)
362 			link = &(*link)->rb_left;
363 		else
364 			link = &(*link)->rb_right;
365 	}
366 
367 	rb_link_node(&new->node, parent, link);
368 	rb_insert_color(&new->node, &dma->pfn_list);
369 }
370 
371 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
372 {
373 	rb_erase(&old->node, &dma->pfn_list);
374 }
375 
376 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
377 				unsigned long pfn)
378 {
379 	struct vfio_pfn *vpfn;
380 
381 	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
382 	if (!vpfn)
383 		return -ENOMEM;
384 
385 	vpfn->iova = iova;
386 	vpfn->pfn = pfn;
387 	vpfn->ref_count = 1;
388 	vfio_link_pfn(dma, vpfn);
389 	return 0;
390 }
391 
392 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
393 				      struct vfio_pfn *vpfn)
394 {
395 	vfio_unlink_pfn(dma, vpfn);
396 	kfree(vpfn);
397 }
398 
399 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
400 					       unsigned long iova)
401 {
402 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
403 
404 	if (vpfn)
405 		vpfn->ref_count++;
406 	return vpfn;
407 }
408 
409 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
410 {
411 	int ret = 0;
412 
413 	vpfn->ref_count--;
414 	if (!vpfn->ref_count) {
415 		ret = put_pfn(vpfn->pfn, dma->prot);
416 		vfio_remove_from_pfn_list(dma, vpfn);
417 	}
418 	return ret;
419 }
420 
421 static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
422 			bool lock_cap, long npage)
423 {
424 	int ret = mmap_write_lock_killable(mm);
425 
426 	if (ret)
427 		return ret;
428 
429 	ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
430 	mmap_write_unlock(mm);
431 	return ret;
432 }
433 
434 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
435 {
436 	struct mm_struct *mm;
437 	int ret;
438 
439 	if (!npage)
440 		return 0;
441 
442 	mm = dma->mm;
443 	if (async && !mmget_not_zero(mm))
444 		return -ESRCH; /* process exited */
445 
446 	ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
447 	if (!ret)
448 		dma->locked_vm += npage;
449 
450 	if (async)
451 		mmput(mm);
452 
453 	return ret;
454 }
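/*
 * Accounting sketch (illustrative): pinning 256 user pages calls
 * vfio_lock_acct(dma, 256, ...), which raises the owning mm's locked_vm by
 * 256 pages (1 MB at 4K) subject to RLIMIT_MEMLOCK unless the task holds
 * CAP_IPC_LOCK; the unpin paths pass a negative npage to undo the charge.
 */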
455 
456 /*
457  * Some mappings aren't backed by a struct page, for example an mmap'd
458  * MMIO range for our own or another device.  These use a different
459  * pfn conversion and shouldn't be tracked as locked pages.
460  * For compound pages, any driver that sets the reserved bit in head
461  * page needs to set the reserved bit in all subpages to be safe.
462  */
463 static bool is_invalid_reserved_pfn(unsigned long pfn)
464 {
465 	if (pfn_valid(pfn))
466 		return PageReserved(pfn_to_page(pfn));
467 
468 	return true;
469 }
470 
471 static int put_pfn(unsigned long pfn, int prot)
472 {
473 	if (!is_invalid_reserved_pfn(pfn)) {
474 		struct page *page = pfn_to_page(pfn);
475 
476 		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
477 		return 1;
478 	}
479 	return 0;
480 }
481 
482 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
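/*
 * For example, with 4K pages and 8-byte pointers this works out to 512
 * struct page pointers per batch, i.e. one __get_free_page() of backing
 * store lets vaddr_get_pfns() pin up to 2 MB of user memory per refill.
 */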
483 
484 static void vfio_batch_init(struct vfio_batch *batch)
485 {
486 	batch->size = 0;
487 	batch->offset = 0;
488 
489 	if (unlikely(disable_hugepages))
490 		goto fallback;
491 
492 	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
493 	if (!batch->pages)
494 		goto fallback;
495 
496 	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
497 	return;
498 
499 fallback:
500 	batch->pages = &batch->fallback_page;
501 	batch->capacity = 1;
502 }
503 
504 static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
505 {
506 	while (batch->size) {
507 		unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
508 
509 		put_pfn(pfn, dma->prot);
510 		batch->offset++;
511 		batch->size--;
512 	}
513 }
514 
515 static void vfio_batch_fini(struct vfio_batch *batch)
516 {
517 	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
518 		free_page((unsigned long)batch->pages);
519 }
520 
521 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
522 			    unsigned long vaddr, unsigned long *pfn,
523 			    bool write_fault)
524 {
525 	pte_t *ptep;
526 	spinlock_t *ptl;
527 	int ret;
528 
529 	ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
530 	if (ret) {
531 		bool unlocked = false;
532 
533 		ret = fixup_user_fault(mm, vaddr,
534 				       FAULT_FLAG_REMOTE |
535 				       (write_fault ?  FAULT_FLAG_WRITE : 0),
536 				       &unlocked);
537 		if (unlocked)
538 			return -EAGAIN;
539 
540 		if (ret)
541 			return ret;
542 
543 		ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
544 		if (ret)
545 			return ret;
546 	}
547 
548 	if (write_fault && !pte_write(*ptep))
549 		ret = -EFAULT;
550 	else
551 		*pfn = pte_pfn(*ptep);
552 
553 	pte_unmap_unlock(ptep, ptl);
554 	return ret;
555 }
556 
557 /*
558  * Returns the positive number of pfns successfully obtained or a negative
559  * error code.
560  */
561 static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
562 			  long npages, int prot, unsigned long *pfn,
563 			  struct page **pages)
564 {
565 	struct vm_area_struct *vma;
566 	unsigned int flags = 0;
567 	int ret;
568 
569 	if (prot & IOMMU_WRITE)
570 		flags |= FOLL_WRITE;
571 
572 	mmap_read_lock(mm);
573 	ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
574 				    pages, NULL, NULL);
575 	if (ret > 0) {
576 		int i;
577 
578 		/*
579 		 * The zero page is always resident, we don't need to pin it
580 		 * and it falls into our invalid/reserved test so we don't
581 		 * unpin in put_pfn().  Unpin all zero pages in the batch here.
582 		 */
583 		for (i = 0 ; i < ret; i++) {
584 			if (unlikely(is_zero_pfn(page_to_pfn(pages[i]))))
585 				unpin_user_page(pages[i]);
586 		}
587 
588 		*pfn = page_to_pfn(pages[0]);
589 		goto done;
590 	}
591 
592 	vaddr = untagged_addr(vaddr);
593 
594 retry:
595 	vma = vma_lookup(mm, vaddr);
596 
597 	if (vma && vma->vm_flags & VM_PFNMAP) {
598 		ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
599 		if (ret == -EAGAIN)
600 			goto retry;
601 
602 		if (!ret) {
603 			if (is_invalid_reserved_pfn(*pfn))
604 				ret = 1;
605 			else
606 				ret = -EFAULT;
607 		}
608 	}
609 done:
610 	mmap_read_unlock(mm);
611 	return ret;
612 }
613 
614 static int vfio_wait(struct vfio_iommu *iommu)
615 {
616 	DEFINE_WAIT(wait);
617 
618 	prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
619 	mutex_unlock(&iommu->lock);
620 	schedule();
621 	mutex_lock(&iommu->lock);
622 	finish_wait(&iommu->vaddr_wait, &wait);
623 	if (kthread_should_stop() || !iommu->container_open ||
624 	    fatal_signal_pending(current)) {
625 		return -EFAULT;
626 	}
627 	return WAITED;
628 }
629 
630 /*
631  * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
632  * if the task waits, but is re-locked on return.  Return result in *dma_p.
633  * Return 0 on success with no waiting, WAITED on success if waited, and -errno
634  * on error.
635  */
636 static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start,
637 			       size_t size, struct vfio_dma **dma_p)
638 {
639 	int ret = 0;
640 
641 	do {
642 		*dma_p = vfio_find_dma(iommu, start, size);
643 		if (!*dma_p)
644 			return -EINVAL;
645 		else if (!(*dma_p)->vaddr_invalid)
646 			return ret;
647 		else
648 			ret = vfio_wait(iommu);
649 	} while (ret == WAITED);
650 
651 	return ret;
652 }
653 
654 /*
655  * Wait for all vaddr in the dma_list to become valid.  iommu lock is dropped
656  * if the task waits, but is re-locked on return.  Return 0 on success with no
657  * waiting, WAITED on success if waited, and -errno on error.
658  */
659 static int vfio_wait_all_valid(struct vfio_iommu *iommu)
660 {
661 	int ret = 0;
662 
663 	while (iommu->vaddr_invalid_count && ret >= 0)
664 		ret = vfio_wait(iommu);
665 
666 	return ret;
667 }
668 
669 /*
670  * Attempt to pin pages.  We really don't want to track all the pfns and
671  * the iommu can only map chunks of consecutive pfns anyway, so get the
672  * first page and all consecutive pages with the same locking.
673  */
674 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
675 				  long npage, unsigned long *pfn_base,
676 				  unsigned long limit, struct vfio_batch *batch)
677 {
678 	unsigned long pfn;
679 	struct mm_struct *mm = current->mm;
680 	long ret, pinned = 0, lock_acct = 0;
681 	bool rsvd;
682 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
683 
684 	/* This code path is only user initiated */
685 	if (!mm)
686 		return -ENODEV;
687 
688 	if (batch->size) {
689 		/* Leftover pages in batch from an earlier call. */
690 		*pfn_base = page_to_pfn(batch->pages[batch->offset]);
691 		pfn = *pfn_base;
692 		rsvd = is_invalid_reserved_pfn(*pfn_base);
693 	} else {
694 		*pfn_base = 0;
695 	}
696 
697 	while (npage) {
698 		if (!batch->size) {
699 			/* Empty batch, so refill it. */
700 			long req_pages = min_t(long, npage, batch->capacity);
701 
702 			ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
703 					     &pfn, batch->pages);
704 			if (ret < 0)
705 				goto unpin_out;
706 
707 			batch->size = ret;
708 			batch->offset = 0;
709 
710 			if (!*pfn_base) {
711 				*pfn_base = pfn;
712 				rsvd = is_invalid_reserved_pfn(*pfn_base);
713 			}
714 		}
715 
716 		/*
717 		 * pfn is preset for the first iteration of this inner loop and
718 		 * updated at the end to handle a VM_PFNMAP pfn.  In that case,
719 		 * batch->pages isn't valid (there's no struct page), so allow
720 		 * batch->pages to be touched only when there's more than one
721 		 * pfn to check, which guarantees the pfns are from a
722 		 * !VM_PFNMAP vma.
723 		 */
724 		while (true) {
725 			if (pfn != *pfn_base + pinned ||
726 			    rsvd != is_invalid_reserved_pfn(pfn))
727 				goto out;
728 
729 			/*
730 			 * Reserved pages aren't counted against the user,
731 			 * externally pinned pages are already counted against
732 			 * the user.
733 			 */
734 			if (!rsvd && !vfio_find_vpfn(dma, iova)) {
735 				if (!dma->lock_cap &&
736 				    mm->locked_vm + lock_acct + 1 > limit) {
737 					pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
738 						__func__, limit << PAGE_SHIFT);
739 					ret = -ENOMEM;
740 					goto unpin_out;
741 				}
742 				lock_acct++;
743 			}
744 
745 			pinned++;
746 			npage--;
747 			vaddr += PAGE_SIZE;
748 			iova += PAGE_SIZE;
749 			batch->offset++;
750 			batch->size--;
751 
752 			if (!batch->size)
753 				break;
754 
755 			pfn = page_to_pfn(batch->pages[batch->offset]);
756 		}
757 
758 		if (unlikely(disable_hugepages))
759 			break;
760 	}
761 
762 out:
763 	ret = vfio_lock_acct(dma, lock_acct, false);
764 
765 unpin_out:
766 	if (batch->size == 1 && !batch->offset) {
767 		/* May be a VM_PFNMAP pfn, which the batch can't remember. */
768 		put_pfn(pfn, dma->prot);
769 		batch->size = 0;
770 	}
771 
772 	if (ret < 0) {
773 		if (pinned && !rsvd) {
774 			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
775 				put_pfn(pfn, dma->prot);
776 		}
777 		vfio_batch_unpin(batch, dma);
778 
779 		return ret;
780 	}
781 
782 	return pinned;
783 }
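/*
 * Usage sketch (illustrative): callers such as vfio_pin_map_dma() invoke this
 * in a loop.  A request for 512 pages backed by one 2 MB hugepage will
 * typically return 512 in a single call, since each pfn passes the
 * *pfn_base + pinned contiguity check; crossing into a non-contiguous or
 * reserved region ends the run early and the remainder is handled by the
 * next call, which reuses any leftover entries still in the batch.
 */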
784 
785 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
786 				    unsigned long pfn, long npage,
787 				    bool do_accounting)
788 {
789 	long unlocked = 0, locked = 0;
790 	long i;
791 
792 	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
793 		if (put_pfn(pfn++, dma->prot)) {
794 			unlocked++;
795 			if (vfio_find_vpfn(dma, iova))
796 				locked++;
797 		}
798 	}
799 
800 	if (do_accounting)
801 		vfio_lock_acct(dma, locked - unlocked, true);
802 
803 	return unlocked;
804 }
805 
806 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
807 				  unsigned long *pfn_base, bool do_accounting)
808 {
809 	struct page *pages[1];
810 	struct mm_struct *mm;
811 	int ret;
812 
813 	mm = dma->mm;
814 	if (!mmget_not_zero(mm))
815 		return -ENODEV;
816 
817 	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
818 	if (ret != 1)
819 		goto out;
820 
821 	ret = 0;
822 
823 	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
824 		ret = vfio_lock_acct(dma, 1, false);
825 		if (ret) {
826 			put_pfn(*pfn_base, dma->prot);
827 			if (ret == -ENOMEM)
828 				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
829 					"(%ld) exceeded\n", __func__,
830 					dma->task->comm, task_pid_nr(dma->task),
831 					task_rlimit(dma->task, RLIMIT_MEMLOCK));
832 		}
833 	}
834 
835 out:
836 	mmput(mm);
837 	return ret;
838 }
839 
840 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
841 				    bool do_accounting)
842 {
843 	int unlocked;
844 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
845 
846 	if (!vpfn)
847 		return 0;
848 
849 	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
850 
851 	if (do_accounting)
852 		vfio_lock_acct(dma, -unlocked, true);
853 
854 	return unlocked;
855 }
856 
857 static int vfio_iommu_type1_pin_pages(void *iommu_data,
858 				      struct iommu_group *iommu_group,
859 				      unsigned long *user_pfn,
860 				      int npage, int prot,
861 				      unsigned long *phys_pfn)
862 {
863 	struct vfio_iommu *iommu = iommu_data;
864 	struct vfio_iommu_group *group;
865 	int i, j, ret;
866 	unsigned long remote_vaddr;
867 	struct vfio_dma *dma;
868 	bool do_accounting;
869 	dma_addr_t iova;
870 
871 	if (!iommu || !user_pfn || !phys_pfn)
872 		return -EINVAL;
873 
874 	/* Supported for v2 version only */
875 	if (!iommu->v2)
876 		return -EACCES;
877 
878 	mutex_lock(&iommu->lock);
879 
880 	/*
881 	 * Wait for all necessary vaddr's to be valid so they can be used in
882 	 * the main loop without dropping the lock, to avoid racing vs unmap.
883 	 */
884 again:
885 	if (iommu->vaddr_invalid_count) {
886 		for (i = 0; i < npage; i++) {
887 			iova = user_pfn[i] << PAGE_SHIFT;
888 			ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
889 			if (ret < 0)
890 				goto pin_done;
891 			if (ret == WAITED)
892 				goto again;
893 		}
894 	}
895 
896 	/* Fail if notifier list is empty */
897 	if (!iommu->notifier.head) {
898 		ret = -EINVAL;
899 		goto pin_done;
900 	}
901 
902 	/*
903 	 * If iommu capable domain exist in the container then all pages are
904 	 * already pinned and accounted. Accounting should be done if there is no
905 	 * iommu capable domain in the container.
906 	 */
907 	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
908 
909 	for (i = 0; i < npage; i++) {
910 		struct vfio_pfn *vpfn;
911 
912 		iova = user_pfn[i] << PAGE_SHIFT;
913 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
914 		if (!dma) {
915 			ret = -EINVAL;
916 			goto pin_unwind;
917 		}
918 
919 		if ((dma->prot & prot) != prot) {
920 			ret = -EPERM;
921 			goto pin_unwind;
922 		}
923 
924 		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
925 		if (vpfn) {
926 			phys_pfn[i] = vpfn->pfn;
927 			continue;
928 		}
929 
930 		remote_vaddr = dma->vaddr + (iova - dma->iova);
931 		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
932 					     do_accounting);
933 		if (ret)
934 			goto pin_unwind;
935 
936 		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
937 		if (ret) {
938 			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
939 				vfio_lock_acct(dma, -1, true);
940 			goto pin_unwind;
941 		}
942 
943 		if (iommu->dirty_page_tracking) {
944 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
945 
946 			/*
947 			 * Bitmap populated with the smallest supported page
948 			 * size
949 			 */
950 			bitmap_set(dma->bitmap,
951 				   (iova - dma->iova) >> pgshift, 1);
952 		}
953 	}
954 	ret = i;
955 
956 	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
957 	if (!group->pinned_page_dirty_scope) {
958 		group->pinned_page_dirty_scope = true;
959 		iommu->num_non_pinned_groups--;
960 	}
961 
962 	goto pin_done;
963 
964 pin_unwind:
965 	phys_pfn[i] = 0;
966 	for (j = 0; j < i; j++) {
967 		dma_addr_t iova;
968 
969 		iova = user_pfn[j] << PAGE_SHIFT;
970 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
971 		vfio_unpin_page_external(dma, iova, do_accounting);
972 		phys_pfn[j] = 0;
973 	}
974 pin_done:
975 	mutex_unlock(&iommu->lock);
976 	return ret;
977 }
978 
979 static int vfio_iommu_type1_unpin_pages(void *iommu_data,
980 					unsigned long *user_pfn,
981 					int npage)
982 {
983 	struct vfio_iommu *iommu = iommu_data;
984 	bool do_accounting;
985 	int i;
986 
987 	if (!iommu || !user_pfn || npage <= 0)
988 		return -EINVAL;
989 
990 	/* Supported for v2 version only */
991 	if (!iommu->v2)
992 		return -EACCES;
993 
994 	mutex_lock(&iommu->lock);
995 
996 	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
997 	for (i = 0; i < npage; i++) {
998 		struct vfio_dma *dma;
999 		dma_addr_t iova;
1000 
1001 		iova = user_pfn[i] << PAGE_SHIFT;
1002 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
1003 		if (!dma)
1004 			break;
1005 
1006 		vfio_unpin_page_external(dma, iova, do_accounting);
1007 	}
1008 
1009 	mutex_unlock(&iommu->lock);
1010 	return i > 0 ? i : -EINVAL;
1011 }
1012 
1013 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
1014 			    struct list_head *regions,
1015 			    struct iommu_iotlb_gather *iotlb_gather)
1016 {
1017 	long unlocked = 0;
1018 	struct vfio_regions *entry, *next;
1019 
1020 	iommu_iotlb_sync(domain->domain, iotlb_gather);
1021 
1022 	list_for_each_entry_safe(entry, next, regions, list) {
1023 		unlocked += vfio_unpin_pages_remote(dma,
1024 						    entry->iova,
1025 						    entry->phys >> PAGE_SHIFT,
1026 						    entry->len >> PAGE_SHIFT,
1027 						    false);
1028 		list_del(&entry->list);
1029 		kfree(entry);
1030 	}
1031 
1032 	cond_resched();
1033 
1034 	return unlocked;
1035 }
1036 
1037 /*
1038  * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
1039  * Therefore, when using IOTLB flush sync interface, VFIO need to keep track
1040  * of these regions (currently using a list).
1041  *
1042  * This value specifies maximum number of regions for each IOTLB flush sync.
1043  */
1044 #define VFIO_IOMMU_TLB_SYNC_MAX		512
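/*
 * Rough example of the trade-off, assuming 2 MB contiguous chunks: with up
 * to 512 deferred regions per sync, about 1 GB worth of mappings can be torn
 * down between IOTLB flushes before vfio_sync_unpin() runs and the
 * corresponding pages are unpinned.
 */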
1045 
1046 static size_t unmap_unpin_fast(struct vfio_domain *domain,
1047 			       struct vfio_dma *dma, dma_addr_t *iova,
1048 			       size_t len, phys_addr_t phys, long *unlocked,
1049 			       struct list_head *unmapped_list,
1050 			       int *unmapped_cnt,
1051 			       struct iommu_iotlb_gather *iotlb_gather)
1052 {
1053 	size_t unmapped = 0;
1054 	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
1055 
1056 	if (entry) {
1057 		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
1058 					    iotlb_gather);
1059 
1060 		if (!unmapped) {
1061 			kfree(entry);
1062 		} else {
1063 			entry->iova = *iova;
1064 			entry->phys = phys;
1065 			entry->len  = unmapped;
1066 			list_add_tail(&entry->list, unmapped_list);
1067 
1068 			*iova += unmapped;
1069 			(*unmapped_cnt)++;
1070 		}
1071 	}
1072 
1073 	/*
1074 	 * Sync if the number of fast-unmap regions hits the limit
1075 	 * or in case of errors.
1076 	 */
1077 	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
1078 		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
1079 					     iotlb_gather);
1080 		*unmapped_cnt = 0;
1081 	}
1082 
1083 	return unmapped;
1084 }
1085 
1086 static size_t unmap_unpin_slow(struct vfio_domain *domain,
1087 			       struct vfio_dma *dma, dma_addr_t *iova,
1088 			       size_t len, phys_addr_t phys,
1089 			       long *unlocked)
1090 {
1091 	size_t unmapped = iommu_unmap(domain->domain, *iova, len);
1092 
1093 	if (unmapped) {
1094 		*unlocked += vfio_unpin_pages_remote(dma, *iova,
1095 						     phys >> PAGE_SHIFT,
1096 						     unmapped >> PAGE_SHIFT,
1097 						     false);
1098 		*iova += unmapped;
1099 		cond_resched();
1100 	}
1101 	return unmapped;
1102 }
1103 
1104 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
1105 			     bool do_accounting)
1106 {
1107 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
1108 	struct vfio_domain *domain, *d;
1109 	LIST_HEAD(unmapped_region_list);
1110 	struct iommu_iotlb_gather iotlb_gather;
1111 	int unmapped_region_cnt = 0;
1112 	long unlocked = 0;
1113 
1114 	if (!dma->size)
1115 		return 0;
1116 
1117 	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1118 		return 0;
1119 
1120 	/*
1121 	 * We use the IOMMU to track the physical addresses, otherwise we'd
1122 	 * need a much more complicated tracking system.  Unfortunately that
1123 	 * means we need to use one of the iommu domains to figure out the
1124 	 * pfns to unpin.  The rest need to be unmapped in advance so we have
1125 	 * no iommu translations remaining when the pages are unpinned.
1126 	 */
1127 	domain = d = list_first_entry(&iommu->domain_list,
1128 				      struct vfio_domain, next);
1129 
1130 	list_for_each_entry_continue(d, &iommu->domain_list, next) {
1131 		iommu_unmap(d->domain, dma->iova, dma->size);
1132 		cond_resched();
1133 	}
1134 
1135 	iommu_iotlb_gather_init(&iotlb_gather);
1136 	while (iova < end) {
1137 		size_t unmapped, len;
1138 		phys_addr_t phys, next;
1139 
1140 		phys = iommu_iova_to_phys(domain->domain, iova);
1141 		if (WARN_ON(!phys)) {
1142 			iova += PAGE_SIZE;
1143 			continue;
1144 		}
1145 
1146 		/*
1147 		 * To optimize for fewer iommu_unmap() calls, each of which
1148 		 * may require hardware cache flushing, try to find the
1149 		 * largest contiguous physical memory chunk to unmap.
1150 		 */
1151 		for (len = PAGE_SIZE;
1152 		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
1153 			next = iommu_iova_to_phys(domain->domain, iova + len);
1154 			if (next != phys + len)
1155 				break;
1156 		}
1157 
1158 		/*
1159 		 * First, try to use fast unmap/unpin. In case of failure,
1160 		 * switch to slow unmap/unpin path.
1161 		 */
1162 		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
1163 					    &unlocked, &unmapped_region_list,
1164 					    &unmapped_region_cnt,
1165 					    &iotlb_gather);
1166 		if (!unmapped) {
1167 			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
1168 						    phys, &unlocked);
1169 			if (WARN_ON(!unmapped))
1170 				break;
1171 		}
1172 	}
1173 
1174 	dma->iommu_mapped = false;
1175 
1176 	if (unmapped_region_cnt) {
1177 		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
1178 					    &iotlb_gather);
1179 	}
1180 
1181 	if (do_accounting) {
1182 		vfio_lock_acct(dma, -unlocked, true);
1183 		return 0;
1184 	}
1185 	return unlocked;
1186 }
1187 
1188 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
1189 {
1190 	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
1191 	vfio_unmap_unpin(iommu, dma, true);
1192 	vfio_unlink_dma(iommu, dma);
1193 	put_task_struct(dma->task);
1194 	mmdrop(dma->mm);
1195 	vfio_dma_bitmap_free(dma);
1196 	if (dma->vaddr_invalid) {
1197 		iommu->vaddr_invalid_count--;
1198 		wake_up_all(&iommu->vaddr_wait);
1199 	}
1200 	kfree(dma);
1201 	iommu->dma_avail++;
1202 }
1203 
1204 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1205 {
1206 	struct vfio_domain *domain;
1207 
1208 	iommu->pgsize_bitmap = ULONG_MAX;
1209 
1210 	list_for_each_entry(domain, &iommu->domain_list, next)
1211 		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1212 
1213 	/*
1214 	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
1215 	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
1216 	 * That way the user will be able to map/unmap buffers whose size/
1217 	 * start address is aligned with PAGE_SIZE. Pinning code uses that
1218 	 * granularity while iommu driver can use the sub-PAGE_SIZE size
1219 	 * to map the buffer.
1220 	 */
1221 	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1222 		iommu->pgsize_bitmap &= PAGE_MASK;
1223 		iommu->pgsize_bitmap |= PAGE_SIZE;
1224 	}
1225 }
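/*
 * Example, assuming a 4K PAGE_SIZE: if one domain advertises 4K | 2M | 1G
 * and another advertises 4K | 64K, the intersection is 4K.  If an IOMMU also
 * reported sub-4K sizes (say 1K), the fixup above masks them out and
 * substitutes PAGE_SIZE, so userspace only ever sees PAGE_SIZE granularity.
 */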
1226 
1227 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1228 			      struct vfio_dma *dma, dma_addr_t base_iova,
1229 			      size_t pgsize)
1230 {
1231 	unsigned long pgshift = __ffs(pgsize);
1232 	unsigned long nbits = dma->size >> pgshift;
1233 	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
1234 	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
1235 	unsigned long shift = bit_offset % BITS_PER_LONG;
1236 	unsigned long leftover;
1237 
1238 	/*
1239 	 * mark all pages dirty if any IOMMU capable device is not able
1240 	 * to report dirty pages and all pages are pinned and mapped.
1241 	 */
1242 	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
1243 		bitmap_set(dma->bitmap, 0, nbits);
1244 
1245 	if (shift) {
1246 		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
1247 				  nbits + shift);
1248 
1249 		if (copy_from_user(&leftover,
1250 				   (void __user *)(bitmap + copy_offset),
1251 				   sizeof(leftover)))
1252 			return -EFAULT;
1253 
1254 		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1255 	}
1256 
1257 	if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1258 			 DIRTY_BITMAP_BYTES(nbits + shift)))
1259 		return -EFAULT;
1260 
1261 	return 0;
1262 }
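/*
 * Worked example of the offset math, assuming pgsize = 4K: for base_iova = 0
 * and a vfio_dma at iova 0x9000, bit_offset = 9, so copy_offset = 0 and
 * shift = 9; the local bitmap is shifted left by 9 bits and merged with the
 * low bits of the user's existing first u64 before being copied back out.
 */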
1263 
1264 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1265 				  dma_addr_t iova, size_t size, size_t pgsize)
1266 {
1267 	struct vfio_dma *dma;
1268 	struct rb_node *n;
1269 	unsigned long pgshift = __ffs(pgsize);
1270 	int ret;
1271 
1272 	/*
1273 	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
1274 	 * vfio_dma mappings may be clubbed by specifying large ranges, but
1275 	 * there must not be any previous mappings bisected by the range.
1276 	 * An error will be returned if these conditions are not met.
1277 	 */
1278 	dma = vfio_find_dma(iommu, iova, 1);
1279 	if (dma && dma->iova != iova)
1280 		return -EINVAL;
1281 
1282 	dma = vfio_find_dma(iommu, iova + size - 1, 0);
1283 	if (dma && dma->iova + dma->size != iova + size)
1284 		return -EINVAL;
1285 
1286 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1287 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1288 
1289 		if (dma->iova < iova)
1290 			continue;
1291 
1292 		if (dma->iova > iova + size - 1)
1293 			break;
1294 
1295 		ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1296 		if (ret)
1297 			return ret;
1298 
1299 		/*
1300 		 * Re-populate bitmap to include all pinned pages which are
1301 		 * considered as dirty but exclude pages which are unpinned and
1302 		 * pages which are marked dirty by vfio_dma_rw()
1303 		 */
1304 		bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1305 		vfio_dma_populate_bitmap(dma, pgsize);
1306 	}
1307 	return 0;
1308 }
1309 
1310 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1311 {
1312 	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1313 	    (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1314 		return -EINVAL;
1315 
1316 	return 0;
1317 }
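/*
 * For instance, reporting dirty state for 1M pages (4 GB at 4K granularity)
 * requires a user buffer of at least DIRTY_BITMAP_BYTES(1 << 20) = 128 KB,
 * and anything above DIRTY_BITMAP_SIZE_MAX is rejected outright.
 */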
1318 
1319 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1320 			     struct vfio_iommu_type1_dma_unmap *unmap,
1321 			     struct vfio_bitmap *bitmap)
1322 {
1323 	struct vfio_dma *dma, *dma_last = NULL;
1324 	size_t unmapped = 0, pgsize;
1325 	int ret = -EINVAL, retries = 0;
1326 	unsigned long pgshift;
1327 	dma_addr_t iova = unmap->iova;
1328 	u64 size = unmap->size;
1329 	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
1330 	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
1331 	struct rb_node *n, *first_n;
1332 
1333 	mutex_lock(&iommu->lock);
1334 
1335 	pgshift = __ffs(iommu->pgsize_bitmap);
1336 	pgsize = (size_t)1 << pgshift;
1337 
1338 	if (iova & (pgsize - 1))
1339 		goto unlock;
1340 
1341 	if (unmap_all) {
1342 		if (iova || size)
1343 			goto unlock;
1344 		size = U64_MAX;
1345 	} else if (!size || size & (pgsize - 1) ||
1346 		   iova + size - 1 < iova || size > SIZE_MAX) {
1347 		goto unlock;
1348 	}
1349 
1350 	/* When dirty tracking is enabled, allow only min supported pgsize */
1351 	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1352 	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1353 		goto unlock;
1354 	}
1355 
1356 	WARN_ON((pgsize - 1) & PAGE_MASK);
1357 again:
1358 	/*
1359 	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
1360 	 * avoid tracking individual mappings.  This means that the granularity
1361 	 * of the original mapping was lost and the user was allowed to attempt
1362 	 * to unmap any range.  Depending on the contiguousness of physical
1363 	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
1364 	 * or may not have worked.  We only guaranteed unmap granularity
1365 	 * matching the original mapping; even though it was untracked here,
1366 	 * the original mappings are reflected in IOMMU mappings.  This
1367 	 * resulted in a couple unusual behaviors.  First, if a range is not
1368 	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
1369 	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1370 	 * a zero sized unmap.  Also, if an unmap request overlaps the first
1371 	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
1372 	 * This also returns success and the returned unmap size reflects the
1373 	 * actual size unmapped.
1374 	 *
1375 	 * We attempt to maintain compatibility with this "v1" interface, but
1376 	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
1377 	 * request offset from the beginning of the original mapping will
1378 	 * return success with zero sized unmap.  And an unmap request covering
1379 	 * the first iova of mapping will unmap the entire range.
1380 	 *
1381 	 * The v2 version of this interface intends to be more deterministic.
1382 	 * Unmap requests must fully cover previous mappings.  Multiple
1383 	 * mappings may still be unmapped by specifying large ranges, but there
1384 	 * must not be any previous mappings bisected by the range.  An error
1385 	 * will be returned if these conditions are not met.  The v2 interface
1386 	 * will only return success and a size of zero if there were no
1387 	 * mappings within the range.
1388 	 */
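	/*
	 * Example of the v2 rule, with made-up addresses: after mapping 2 MB
	 * at iova 0x100000, an unmap of (0x101000, 0x1000) fails with -EINVAL
	 * because it bisects the mapping, an unmap of (0x100000, 0x200000)
	 * removes it entirely, and an unmap of an untouched range succeeds
	 * with unmap->size reported as zero.
	 */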
1389 	if (iommu->v2 && !unmap_all) {
1390 		dma = vfio_find_dma(iommu, iova, 1);
1391 		if (dma && dma->iova != iova)
1392 			goto unlock;
1393 
1394 		dma = vfio_find_dma(iommu, iova + size - 1, 0);
1395 		if (dma && dma->iova + dma->size != iova + size)
1396 			goto unlock;
1397 	}
1398 
1399 	ret = 0;
1400 	n = first_n = vfio_find_dma_first_node(iommu, iova, size);
1401 
1402 	while (n) {
1403 		dma = rb_entry(n, struct vfio_dma, node);
1404 		if (dma->iova >= iova + size)
1405 			break;
1406 
1407 		if (!iommu->v2 && iova > dma->iova)
1408 			break;
1409 		/*
1410 		 * Task with same address space who mapped this iova range is
1411 		 * allowed to unmap the iova range.
1412 		 */
1413 		if (dma->task->mm != current->mm)
1414 			break;
1415 
1416 		if (invalidate_vaddr) {
1417 			if (dma->vaddr_invalid) {
1418 				struct rb_node *last_n = n;
1419 
1420 				for (n = first_n; n != last_n; n = rb_next(n)) {
1421 					dma = rb_entry(n,
1422 						       struct vfio_dma, node);
1423 					dma->vaddr_invalid = false;
1424 					iommu->vaddr_invalid_count--;
1425 				}
1426 				ret = -EINVAL;
1427 				unmapped = 0;
1428 				break;
1429 			}
1430 			dma->vaddr_invalid = true;
1431 			iommu->vaddr_invalid_count++;
1432 			unmapped += dma->size;
1433 			n = rb_next(n);
1434 			continue;
1435 		}
1436 
1437 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1438 			struct vfio_iommu_type1_dma_unmap nb_unmap;
1439 
1440 			if (dma_last == dma) {
1441 				BUG_ON(++retries > 10);
1442 			} else {
1443 				dma_last = dma;
1444 				retries = 0;
1445 			}
1446 
1447 			nb_unmap.iova = dma->iova;
1448 			nb_unmap.size = dma->size;
1449 
1450 			/*
1451 			 * Notify anyone (mdev vendor drivers) to invalidate and
1452 			 * unmap iovas within the range we're about to unmap.
1453 			 * Vendor drivers MUST unpin pages in response to an
1454 			 * invalidation.
1455 			 */
1456 			mutex_unlock(&iommu->lock);
1457 			blocking_notifier_call_chain(&iommu->notifier,
1458 						    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
1459 						    &nb_unmap);
1460 			mutex_lock(&iommu->lock);
1461 			goto again;
1462 		}
1463 
1464 		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1465 			ret = update_user_bitmap(bitmap->data, iommu, dma,
1466 						 iova, pgsize);
1467 			if (ret)
1468 				break;
1469 		}
1470 
1471 		unmapped += dma->size;
1472 		n = rb_next(n);
1473 		vfio_remove_dma(iommu, dma);
1474 	}
1475 
1476 unlock:
1477 	mutex_unlock(&iommu->lock);
1478 
1479 	/* Report how much was unmapped */
1480 	unmap->size = unmapped;
1481 
1482 	return ret;
1483 }
1484 
1485 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1486 			  unsigned long pfn, long npage, int prot)
1487 {
1488 	struct vfio_domain *d;
1489 	int ret;
1490 
1491 	list_for_each_entry(d, &iommu->domain_list, next) {
1492 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1493 				npage << PAGE_SHIFT, prot | d->prot);
1494 		if (ret)
1495 			goto unwind;
1496 
1497 		cond_resched();
1498 	}
1499 
1500 	return 0;
1501 
1502 unwind:
1503 	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1504 		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1505 		cond_resched();
1506 	}
1507 
1508 	return ret;
1509 }
1510 
1511 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1512 			    size_t map_size)
1513 {
1514 	dma_addr_t iova = dma->iova;
1515 	unsigned long vaddr = dma->vaddr;
1516 	struct vfio_batch batch;
1517 	size_t size = map_size;
1518 	long npage;
1519 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1520 	int ret = 0;
1521 
1522 	vfio_batch_init(&batch);
1523 
1524 	while (size) {
1525 		/* Pin a contiguous chunk of memory */
1526 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1527 					      size >> PAGE_SHIFT, &pfn, limit,
1528 					      &batch);
1529 		if (npage <= 0) {
1530 			WARN_ON(!npage);
1531 			ret = (int)npage;
1532 			break;
1533 		}
1534 
1535 		/* Map it! */
1536 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1537 				     dma->prot);
1538 		if (ret) {
1539 			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1540 						npage, true);
1541 			vfio_batch_unpin(&batch, dma);
1542 			break;
1543 		}
1544 
1545 		size -= npage << PAGE_SHIFT;
1546 		dma->size += npage << PAGE_SHIFT;
1547 	}
1548 
1549 	vfio_batch_fini(&batch);
1550 	dma->iommu_mapped = true;
1551 
1552 	if (ret)
1553 		vfio_remove_dma(iommu, dma);
1554 
1555 	return ret;
1556 }
1557 
1558 /*
1559  * Check dma map request is within a valid iova range
1560  */
1561 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1562 				      dma_addr_t start, dma_addr_t end)
1563 {
1564 	struct list_head *iova = &iommu->iova_list;
1565 	struct vfio_iova *node;
1566 
1567 	list_for_each_entry(node, iova, list) {
1568 		if (start >= node->start && end <= node->end)
1569 			return true;
1570 	}
1571 
1572 	/*
1573 	 * Check for list_empty() as well since a container with
1574 	 * a single mdev device will have an empty list.
1575 	 */
1576 	return list_empty(iova);
1577 }
1578 
1579 static int vfio_change_dma_owner(struct vfio_dma *dma)
1580 {
1581 	struct task_struct *task = current->group_leader;
1582 	struct mm_struct *mm = current->mm;
1583 	long npage = dma->locked_vm;
1584 	bool lock_cap;
1585 	int ret;
1586 
1587 	if (mm == dma->mm)
1588 		return 0;
1589 
1590 	lock_cap = capable(CAP_IPC_LOCK);
1591 	ret = mm_lock_acct(task, mm, lock_cap, npage);
1592 	if (ret)
1593 		return ret;
1594 
1595 	if (mmget_not_zero(dma->mm)) {
1596 		mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
1597 		mmput(dma->mm);
1598 	}
1599 
1600 	if (dma->task != task) {
1601 		put_task_struct(dma->task);
1602 		dma->task = get_task_struct(task);
1603 	}
1604 	mmdrop(dma->mm);
1605 	dma->mm = mm;
1606 	mmgrab(dma->mm);
1607 	dma->lock_cap = lock_cap;
1608 	return 0;
1609 }
1610 
1611 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1612 			   struct vfio_iommu_type1_dma_map *map)
1613 {
1614 	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
1615 	dma_addr_t iova = map->iova;
1616 	unsigned long vaddr = map->vaddr;
1617 	size_t size = map->size;
1618 	int ret = 0, prot = 0;
1619 	size_t pgsize;
1620 	struct vfio_dma *dma;
1621 
1622 	/* Verify that none of our __u64 fields overflow */
1623 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1624 		return -EINVAL;
1625 
1626 	/* READ/WRITE from device perspective */
1627 	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1628 		prot |= IOMMU_WRITE;
1629 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1630 		prot |= IOMMU_READ;
1631 
1632 	if ((prot && set_vaddr) || (!prot && !set_vaddr))
1633 		return -EINVAL;
1634 
1635 	mutex_lock(&iommu->lock);
1636 
1637 	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1638 
1639 	WARN_ON((pgsize - 1) & PAGE_MASK);
1640 
1641 	if (!size || (size | iova | vaddr) & (pgsize - 1)) {
1642 		ret = -EINVAL;
1643 		goto out_unlock;
1644 	}
1645 
1646 	/* Don't allow IOVA or virtual address wrap */
1647 	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1648 		ret = -EINVAL;
1649 		goto out_unlock;
1650 	}
1651 
1652 	dma = vfio_find_dma(iommu, iova, size);
1653 	if (set_vaddr) {
1654 		if (!dma) {
1655 			ret = -ENOENT;
1656 		} else if (!dma->vaddr_invalid || dma->iova != iova ||
1657 			   dma->size != size) {
1658 			ret = -EINVAL;
1659 		} else {
1660 			ret = vfio_change_dma_owner(dma);
1661 			if (ret)
1662 				goto out_unlock;
1663 			dma->vaddr = vaddr;
1664 			dma->vaddr_invalid = false;
1665 			iommu->vaddr_invalid_count--;
1666 			wake_up_all(&iommu->vaddr_wait);
1667 		}
1668 		goto out_unlock;
1669 	} else if (dma) {
1670 		ret = -EEXIST;
1671 		goto out_unlock;
1672 	}
1673 
1674 	if (!iommu->dma_avail) {
1675 		ret = -ENOSPC;
1676 		goto out_unlock;
1677 	}
1678 
1679 	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1680 		ret = -EINVAL;
1681 		goto out_unlock;
1682 	}
1683 
1684 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1685 	if (!dma) {
1686 		ret = -ENOMEM;
1687 		goto out_unlock;
1688 	}
1689 
1690 	iommu->dma_avail--;
1691 	dma->iova = iova;
1692 	dma->vaddr = vaddr;
1693 	dma->prot = prot;
1694 
1695 	/*
1696 	 * We need to be able to both add to a task's locked memory and test
1697 	 * against the locked memory limit and we need to be able to do both
1698 	 * outside of this call path as pinning can be asynchronous via the
1699 	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
1700 	 * task_struct. Save the group_leader so that all DMA tracking uses
1701 	 * the same task, to make debugging easier.  VM locked pages requires
1702 	 * an mm_struct, so grab the mm in case the task dies.
1703 	 */
1704 	get_task_struct(current->group_leader);
1705 	dma->task = current->group_leader;
1706 	dma->lock_cap = capable(CAP_IPC_LOCK);
1707 	dma->mm = current->mm;
1708 	mmgrab(dma->mm);
1709 
1710 	dma->pfn_list = RB_ROOT;
1711 
1712 	/* Insert zero-sized and grow as we map chunks of it */
1713 	vfio_link_dma(iommu, dma);
1714 
1715 	/* Don't pin and map if container doesn't contain IOMMU capable domain*/
1716 	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1717 		dma->size = size;
1718 	else
1719 		ret = vfio_pin_map_dma(iommu, dma, size);
1720 
1721 	if (!ret && iommu->dirty_page_tracking) {
1722 		ret = vfio_dma_bitmap_alloc(dma, pgsize);
1723 		if (ret)
1724 			vfio_remove_dma(iommu, dma);
1725 	}
1726 
1727 out_unlock:
1728 	mutex_unlock(&iommu->lock);
1729 	return ret;
1730 }
1731 
1732 static int vfio_bus_type(struct device *dev, void *data)
1733 {
1734 	struct bus_type **bus = data;
1735 
1736 	if (*bus && *bus != dev->bus)
1737 		return -EINVAL;
1738 
1739 	*bus = dev->bus;
1740 
1741 	return 0;
1742 }
1743 
1744 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1745 			     struct vfio_domain *domain)
1746 {
1747 	struct vfio_batch batch;
1748 	struct vfio_domain *d = NULL;
1749 	struct rb_node *n;
1750 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1751 	int ret;
1752 
1753 	ret = vfio_wait_all_valid(iommu);
1754 	if (ret < 0)
1755 		return ret;
1756 
1757 	/* Arbitrarily pick the first domain in the list for lookups */
1758 	if (!list_empty(&iommu->domain_list))
1759 		d = list_first_entry(&iommu->domain_list,
1760 				     struct vfio_domain, next);
1761 
1762 	vfio_batch_init(&batch);
1763 
1764 	n = rb_first(&iommu->dma_list);
1765 
1766 	for (; n; n = rb_next(n)) {
1767 		struct vfio_dma *dma;
1768 		dma_addr_t iova;
1769 
1770 		dma = rb_entry(n, struct vfio_dma, node);
1771 		iova = dma->iova;
1772 
1773 		while (iova < dma->iova + dma->size) {
1774 			phys_addr_t phys;
1775 			size_t size;
1776 
1777 			if (dma->iommu_mapped) {
1778 				phys_addr_t p;
1779 				dma_addr_t i;
1780 
1781 				if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1782 					ret = -EINVAL;
1783 					goto unwind;
1784 				}
1785 
1786 				phys = iommu_iova_to_phys(d->domain, iova);
1787 
1788 				if (WARN_ON(!phys)) {
1789 					iova += PAGE_SIZE;
1790 					continue;
1791 				}
1792 
1793 				size = PAGE_SIZE;
1794 				p = phys + size;
1795 				i = iova + size;
1796 				while (i < dma->iova + dma->size &&
1797 				       p == iommu_iova_to_phys(d->domain, i)) {
1798 					size += PAGE_SIZE;
1799 					p += PAGE_SIZE;
1800 					i += PAGE_SIZE;
1801 				}
1802 			} else {
1803 				unsigned long pfn;
1804 				unsigned long vaddr = dma->vaddr +
1805 						     (iova - dma->iova);
1806 				size_t n = dma->iova + dma->size - iova;
1807 				long npage;
1808 
1809 				npage = vfio_pin_pages_remote(dma, vaddr,
1810 							      n >> PAGE_SHIFT,
1811 							      &pfn, limit,
1812 							      &batch);
1813 				if (npage <= 0) {
1814 					WARN_ON(!npage);
1815 					ret = (int)npage;
1816 					goto unwind;
1817 				}
1818 
1819 				phys = pfn << PAGE_SHIFT;
1820 				size = npage << PAGE_SHIFT;
1821 			}
1822 
1823 			ret = iommu_map(domain->domain, iova, phys,
1824 					size, dma->prot | domain->prot);
1825 			if (ret) {
1826 				if (!dma->iommu_mapped) {
1827 					vfio_unpin_pages_remote(dma, iova,
1828 							phys >> PAGE_SHIFT,
1829 							size >> PAGE_SHIFT,
1830 							true);
1831 					vfio_batch_unpin(&batch, dma);
1832 				}
1833 				goto unwind;
1834 			}
1835 
1836 			iova += size;
1837 		}
1838 	}
1839 
1840 	/* All dmas are now mapped, defer to second tree walk for unwind */
1841 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1842 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1843 
1844 		dma->iommu_mapped = true;
1845 	}
1846 
1847 	vfio_batch_fini(&batch);
1848 	return 0;
1849 
1850 unwind:
1851 	for (; n; n = rb_prev(n)) {
1852 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1853 		dma_addr_t iova;
1854 
1855 		if (dma->iommu_mapped) {
1856 			iommu_unmap(domain->domain, dma->iova, dma->size);
1857 			continue;
1858 		}
1859 
1860 		iova = dma->iova;
1861 		while (iova < dma->iova + dma->size) {
1862 			phys_addr_t phys, p;
1863 			size_t size;
1864 			dma_addr_t i;
1865 
1866 			phys = iommu_iova_to_phys(domain->domain, iova);
1867 			if (!phys) {
1868 				iova += PAGE_SIZE;
1869 				continue;
1870 			}
1871 
1872 			size = PAGE_SIZE;
1873 			p = phys + size;
1874 			i = iova + size;
1875 			while (i < dma->iova + dma->size &&
1876 			       p == iommu_iova_to_phys(domain->domain, i)) {
1877 				size += PAGE_SIZE;
1878 				p += PAGE_SIZE;
1879 				i += PAGE_SIZE;
1880 			}
1881 
1882 			iommu_unmap(domain->domain, iova, size);
1883 			vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1884 						size >> PAGE_SHIFT, true);
1885 		}
1886 	}
1887 
1888 	vfio_batch_fini(&batch);
1889 	return ret;
1890 }
1891 
1892 /*
1893  * We change our unmap behavior slightly depending on whether the IOMMU
1894  * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
1895  * for practically any contiguous power-of-two mapping we give it.  This means
1896  * we don't need to look for contiguous chunks ourselves to make unmapping
1897  * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
1898  * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1899  * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1900  * hugetlbfs is in use.
1901  */
1902 static void vfio_test_domain_fgsp(struct vfio_domain *domain)
1903 {
1904 	struct page *pages;
1905 	int ret, order = get_order(PAGE_SIZE * 2);
1906 
1907 	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1908 	if (!pages)
1909 		return;
1910 
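	/*
	 * Map two physically contiguous pages, then try to unmap only the
	 * first one.  A driver that builds superpages for any contiguous
	 * range (e.g. AMD-Vi) will tear down more than the requested
	 * PAGE_SIZE, which is what flags the domain as fgsp here.
	 */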
1911 	ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
1912 			IOMMU_READ | IOMMU_WRITE | domain->prot);
1913 	if (!ret) {
1914 		size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
1915 
1916 		if (unmapped == PAGE_SIZE)
1917 			iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
1918 		else
1919 			domain->fgsp = true;
1920 	}
1921 
1922 	__free_pages(pages, order);
1923 }
1924 
1925 static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
1926 						 struct iommu_group *iommu_group)
1927 {
1928 	struct vfio_iommu_group *g;
1929 
1930 	list_for_each_entry(g, &domain->group_list, next) {
1931 		if (g->iommu_group == iommu_group)
1932 			return g;
1933 	}
1934 
1935 	return NULL;
1936 }
1937 
1938 static struct vfio_iommu_group*
1939 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1940 			    struct iommu_group *iommu_group)
1941 {
1942 	struct vfio_domain *domain;
1943 	struct vfio_iommu_group *group = NULL;
1944 
1945 	list_for_each_entry(domain, &iommu->domain_list, next) {
1946 		group = find_iommu_group(domain, iommu_group);
1947 		if (group)
1948 			return group;
1949 	}
1950 
1951 	if (iommu->external_domain)
1952 		group = find_iommu_group(iommu->external_domain, iommu_group);
1953 
1954 	return group;
1955 }
1956 
1957 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1958 				  phys_addr_t *base)
1959 {
1960 	struct iommu_resv_region *region;
1961 	bool ret = false;
1962 
1963 	list_for_each_entry(region, group_resv_regions, list) {
1964 		/*
1965 		 * The presence of any 'real' MSI regions should take
1966 		 * precedence over the software-managed one if the
1967 		 * IOMMU driver happens to advertise both types.
1968 		 */
1969 		if (region->type == IOMMU_RESV_MSI) {
1970 			ret = false;
1971 			break;
1972 		}
1973 
1974 		if (region->type == IOMMU_RESV_SW_MSI) {
1975 			*base = region->start;
1976 			ret = true;
1977 		}
1978 	}
1979 
1980 	return ret;
1981 }
1982 
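/*
 * For an mdev with a backing iommu_device, attach that physical device to
 * the domain: via the auxiliary-domain interface when the device advertises
 * IOMMU_DEV_FEAT_AUX, otherwise as a regular device attach.  mdevs without
 * a backing device are handled purely in software and never reach this path.
 */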
1983 static int vfio_mdev_attach_domain(struct device *dev, void *data)
1984 {
1985 	struct mdev_device *mdev = to_mdev_device(dev);
1986 	struct iommu_domain *domain = data;
1987 	struct device *iommu_device;
1988 
1989 	iommu_device = mdev_get_iommu_device(mdev);
1990 	if (iommu_device) {
1991 		if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1992 			return iommu_aux_attach_device(domain, iommu_device);
1993 		else
1994 			return iommu_attach_device(domain, iommu_device);
1995 	}
1996 
1997 	return -EINVAL;
1998 }
1999 
2000 static int vfio_mdev_detach_domain(struct device *dev, void *data)
2001 {
2002 	struct mdev_device *mdev = to_mdev_device(dev);
2003 	struct iommu_domain *domain = data;
2004 	struct device *iommu_device;
2005 
2006 	iommu_device = mdev_get_iommu_device(mdev);
2007 	if (iommu_device) {
2008 		if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
2009 			iommu_aux_detach_device(domain, iommu_device);
2010 		else
2011 			iommu_detach_device(domain, iommu_device);
2012 	}
2013 
2014 	return 0;
2015 }
2016 
2017 static int vfio_iommu_attach_group(struct vfio_domain *domain,
2018 				   struct vfio_iommu_group *group)
2019 {
2020 	if (group->mdev_group)
2021 		return iommu_group_for_each_dev(group->iommu_group,
2022 						domain->domain,
2023 						vfio_mdev_attach_domain);
2024 	else
2025 		return iommu_attach_group(domain->domain, group->iommu_group);
2026 }
2027 
2028 static void vfio_iommu_detach_group(struct vfio_domain *domain,
2029 				    struct vfio_iommu_group *group)
2030 {
2031 	if (group->mdev_group)
2032 		iommu_group_for_each_dev(group->iommu_group, domain->domain,
2033 					 vfio_mdev_detach_domain);
2034 	else
2035 		iommu_detach_group(domain->domain, group->iommu_group);
2036 }
2037 
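/*
 * symbol_get() is used so that this module only takes a transient reference
 * on mdev_bus_type and gains no hard dependency on the mdev module; if mdev
 * is not loaded the lookup returns NULL and the group is simply treated as
 * a non-mdev group.
 */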
2038 static bool vfio_bus_is_mdev(struct bus_type *bus)
2039 {
2040 	struct bus_type *mdev_bus;
2041 	bool ret = false;
2042 
2043 	mdev_bus = symbol_get(mdev_bus_type);
2044 	if (mdev_bus) {
2045 		ret = (bus == mdev_bus);
2046 		symbol_put(mdev_bus_type);
2047 	}
2048 
2049 	return ret;
2050 }
2051 
2052 static int vfio_mdev_iommu_device(struct device *dev, void *data)
2053 {
2054 	struct mdev_device *mdev = to_mdev_device(dev);
2055 	struct device **old = data, *new;
2056 
2057 	new = mdev_get_iommu_device(mdev);
2058 	if (!new || (*old && *old != new))
2059 		return -EINVAL;
2060 
2061 	*old = new;
2062 
2063 	return 0;
2064 }
2065 
2066 /*
2067  * This is a helper function to insert an address range into the iova list.
2068  * The list is initially created with a single entry corresponding to
2069  * the IOMMU domain geometry to which the device group is attached.
2070  * The list aperture gets modified when a new domain is added to the
2071  * container if the new aperture doesn't conflict with the current one
2072  * or with any existing dma mappings. The list is also modified to
2073  * exclude any reserved regions associated with the device group.
2074  */
2075 static int vfio_iommu_iova_insert(struct list_head *head,
2076 				  dma_addr_t start, dma_addr_t end)
2077 {
2078 	struct vfio_iova *region;
2079 
2080 	region = kmalloc(sizeof(*region), GFP_KERNEL);
2081 	if (!region)
2082 		return -ENOMEM;
2083 
2084 	INIT_LIST_HEAD(&region->list);
2085 	region->start = start;
2086 	region->end = end;
2087 
2088 	list_add_tail(&region->list, head);
2089 	return 0;
2090 }
2091 
2092 /*
2093  * Check whether the new iommu aperture conflicts with the existing
2094  * aperture or with any existing dma mappings.
2095  */
2096 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
2097 				     dma_addr_t start, dma_addr_t end)
2098 {
2099 	struct vfio_iova *first, *last;
2100 	struct list_head *iova = &iommu->iova_list;
2101 
2102 	if (list_empty(iova))
2103 		return false;
2104 
2105 	/* Disjoint sets, return conflict */
2106 	first = list_first_entry(iova, struct vfio_iova, list);
2107 	last = list_last_entry(iova, struct vfio_iova, list);
2108 	if (start > last->end || end < first->start)
2109 		return true;
2110 
2111 	/* Check for any existing dma mappings below the new start */
2112 	if (start > first->start) {
2113 		if (vfio_find_dma(iommu, first->start, start - first->start))
2114 			return true;
2115 	}
2116 
2117 	/* Check for any existing dma mappings beyond the new end */
2118 	if (end < last->end) {
2119 		if (vfio_find_dma(iommu, end + 1, last->end - end))
2120 			return true;
2121 	}
2122 
2123 	return false;
2124 }
2125 
2126 /*
2127  * Resize iommu iova aperture window. This is called only if the new
2128  * aperture has no conflict with existing aperture and dma mappings.
2129  */
2130 static int vfio_iommu_aper_resize(struct list_head *iova,
2131 				  dma_addr_t start, dma_addr_t end)
2132 {
2133 	struct vfio_iova *node, *next;
2134 
2135 	if (list_empty(iova))
2136 		return vfio_iommu_iova_insert(iova, start, end);
2137 
2138 	/* Adjust iova list start */
2139 	list_for_each_entry_safe(node, next, iova, list) {
2140 		if (start < node->start)
2141 			break;
2142 		if (start >= node->start && start < node->end) {
2143 			node->start = start;
2144 			break;
2145 		}
2146 		/* Delete nodes before new start */
2147 		list_del(&node->list);
2148 		kfree(node);
2149 	}
2150 
2151 	/* Adjust iova list end */
2152 	list_for_each_entry_safe(node, next, iova, list) {
2153 		if (end > node->end)
2154 			continue;
2155 		if (end > node->start && end <= node->end) {
2156 			node->end = end;
2157 			continue;
2158 		}
2159 		/* Delete nodes after new end */
2160 		list_del(&node->list);
2161 		kfree(node);
2162 	}
2163 
2164 	return 0;
2165 }
2166 
2167 /*
2168  * Check reserved region conflicts with existing dma mappings
2169  */
2170 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
2171 				     struct list_head *resv_regions)
2172 {
2173 	struct iommu_resv_region *region;
2174 
2175 	/* Check for conflict with existing dma mappings */
2176 	list_for_each_entry(region, resv_regions, list) {
2177 		if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
2178 			continue;
2179 
2180 		if (vfio_find_dma(iommu, region->start, region->length))
2181 			return true;
2182 	}
2183 
2184 	return false;
2185 }
2186 
2187 /*
2188  * Check iova regions for overlap with reserved regions and
2189  * exclude any overlapping ranges from the iommu iova range.
2190  */
2191 static int vfio_iommu_resv_exclude(struct list_head *iova,
2192 				   struct list_head *resv_regions)
2193 {
2194 	struct iommu_resv_region *resv;
2195 	struct vfio_iova *n, *next;
2196 
2197 	list_for_each_entry(resv, resv_regions, list) {
2198 		phys_addr_t start, end;
2199 
2200 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2201 			continue;
2202 
2203 		start = resv->start;
2204 		end = resv->start + resv->length - 1;
2205 
2206 		list_for_each_entry_safe(n, next, iova, list) {
2207 			int ret = 0;
2208 
2209 			/* No overlap */
2210 			if (start > n->end || end < n->start)
2211 				continue;
2212 			/*
2213 			 * If the current node overlaps the reserved region,
2214 			 * insert new node(s) covering only the portions that
2215 			 * remain valid.  New nodes are inserted before the
2216 			 * current node and the current node is then deleted,
2217 			 * keeping the list updated and sorted.
2218 			 */
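			/*
			 * For example (addresses are purely illustrative), a
			 * valid range [0x0 - 0xffffffff] with a reserved
			 * region [0xfee00000 - 0xfeefffff] becomes two nodes,
			 * [0x0 - 0xfedfffff] and [0xfef00000 - 0xffffffff].
			 */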
2219 			if (start > n->start)
2220 				ret = vfio_iommu_iova_insert(&n->list, n->start,
2221 							     start - 1);
2222 			if (!ret && end < n->end)
2223 				ret = vfio_iommu_iova_insert(&n->list, end + 1,
2224 							     n->end);
2225 			if (ret)
2226 				return ret;
2227 
2228 			list_del(&n->list);
2229 			kfree(n);
2230 		}
2231 	}
2232 
2233 	if (list_empty(iova))
2234 		return -EINVAL;
2235 
2236 	return 0;
2237 }
2238 
2239 static void vfio_iommu_resv_free(struct list_head *resv_regions)
2240 {
2241 	struct iommu_resv_region *n, *next;
2242 
2243 	list_for_each_entry_safe(n, next, resv_regions, list) {
2244 		list_del(&n->list);
2245 		kfree(n);
2246 	}
2247 }
2248 
2249 static void vfio_iommu_iova_free(struct list_head *iova)
2250 {
2251 	struct vfio_iova *n, *next;
2252 
2253 	list_for_each_entry_safe(n, next, iova, list) {
2254 		list_del(&n->list);
2255 		kfree(n);
2256 	}
2257 }
2258 
2259 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2260 				    struct list_head *iova_copy)
2261 {
2262 	struct list_head *iova = &iommu->iova_list;
2263 	struct vfio_iova *n;
2264 	int ret;
2265 
2266 	list_for_each_entry(n, iova, list) {
2267 		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2268 		if (ret)
2269 			goto out_free;
2270 	}
2271 
2272 	return 0;
2273 
2274 out_free:
2275 	vfio_iommu_iova_free(iova_copy);
2276 	return ret;
2277 }
2278 
2279 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2280 					struct list_head *iova_copy)
2281 {
2282 	struct list_head *iova = &iommu->iova_list;
2283 
2284 	vfio_iommu_iova_free(iova);
2285 
2286 	list_splice_tail(iova_copy, iova);
2287 }
2288 
2289 static int vfio_iommu_type1_attach_group(void *iommu_data,
2290 					 struct iommu_group *iommu_group)
2291 {
2292 	struct vfio_iommu *iommu = iommu_data;
2293 	struct vfio_iommu_group *group;
2294 	struct vfio_domain *domain, *d;
2295 	struct bus_type *bus = NULL;
2296 	int ret;
2297 	bool resv_msi, msi_remap;
2298 	phys_addr_t resv_msi_base = 0;
2299 	struct iommu_domain_geometry *geo;
2300 	LIST_HEAD(iova_copy);
2301 	LIST_HEAD(group_resv_regions);
2302 
2303 	mutex_lock(&iommu->lock);
2304 
2305 	/* Check for duplicates */
2306 	if (vfio_iommu_find_iommu_group(iommu, iommu_group)) {
2307 		mutex_unlock(&iommu->lock);
2308 		return -EINVAL;
2309 	}
2310 
2311 	group = kzalloc(sizeof(*group), GFP_KERNEL);
2312 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2313 	if (!group || !domain) {
2314 		ret = -ENOMEM;
2315 		goto out_free;
2316 	}
2317 
2318 	group->iommu_group = iommu_group;
2319 
2320 	/* Determine bus_type in order to allocate a domain */
2321 	ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
2322 	if (ret)
2323 		goto out_free;
2324 
2325 	if (vfio_bus_is_mdev(bus)) {
2326 		struct device *iommu_device = NULL;
2327 
2328 		group->mdev_group = true;
2329 
2330 		/* Determine the isolation type */
2331 		ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
2332 					       vfio_mdev_iommu_device);
2333 		if (ret || !iommu_device) {
2334 			if (!iommu->external_domain) {
2335 				INIT_LIST_HEAD(&domain->group_list);
2336 				iommu->external_domain = domain;
2337 				vfio_update_pgsize_bitmap(iommu);
2338 			} else {
2339 				kfree(domain);
2340 			}
2341 
2342 			list_add(&group->next,
2343 				 &iommu->external_domain->group_list);
2344 			/*
2345 			 * A non-iommu backed group cannot dirty memory directly;
2346 			 * it can only use interfaces that provide dirty
2347 			 * tracking.
2348 			 * The iommu scope can only be promoted with the
2349 			 * addition of a dirty tracking group.
2350 			 */
2351 			group->pinned_page_dirty_scope = true;
2352 			mutex_unlock(&iommu->lock);
2353 
2354 			return 0;
2355 		}
2356 
2357 		bus = iommu_device->bus;
2358 	}
2359 
2360 	domain->domain = iommu_domain_alloc(bus);
2361 	if (!domain->domain) {
2362 		ret = -EIO;
2363 		goto out_free;
2364 	}
2365 
2366 	if (iommu->nesting) {
2367 		ret = iommu_enable_nesting(domain->domain);
2368 		if (ret)
2369 			goto out_domain;
2370 	}
2371 
2372 	ret = vfio_iommu_attach_group(domain, group);
2373 	if (ret)
2374 		goto out_domain;
2375 
2376 	/* Get aperture info */
2377 	geo = &domain->domain->geometry;
2378 	if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
2379 				     geo->aperture_end)) {
2380 		ret = -EINVAL;
2381 		goto out_detach;
2382 	}
2383 
2384 	ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2385 	if (ret)
2386 		goto out_detach;
2387 
2388 	if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2389 		ret = -EINVAL;
2390 		goto out_detach;
2391 	}
2392 
2393 	/*
2394 	 * We don't want to work on the original iova list as the list
2395 	 * gets modified and in case of failure we have to retain the
2396 	 * original list. Get a copy here.
2397 	 */
2398 	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2399 	if (ret)
2400 		goto out_detach;
2401 
2402 	ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
2403 				     geo->aperture_end);
2404 	if (ret)
2405 		goto out_detach;
2406 
2407 	ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2408 	if (ret)
2409 		goto out_detach;
2410 
2411 	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2412 
2413 	INIT_LIST_HEAD(&domain->group_list);
2414 	list_add(&group->next, &domain->group_list);
2415 
2416 	msi_remap = irq_domain_check_msi_remap() ||
2417 		    iommu_capable(bus, IOMMU_CAP_INTR_REMAP);
2418 
2419 	if (!allow_unsafe_interrupts && !msi_remap) {
2420 		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2421 		       __func__);
2422 		ret = -EPERM;
2423 		goto out_detach;
2424 	}
2425 
2426 	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
2427 		domain->prot |= IOMMU_CACHE;
2428 
2429 	/*
2430 	 * Try to match an existing compatible domain.  We don't want to
2431 	 * preclude an IOMMU driver supporting multiple bus_types and being
2432 	 * able to include different bus_types in the same IOMMU domain, so
2433 	 * we test whether the domains use the same iommu_ops rather than
2434 	 * testing if they're on the same bus_type.
2435 	 */
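	/*
	 * If re-attaching to a matched existing domain fails, the group is
	 * attached back to the newly allocated domain below and that domain
	 * is kept, rather than failing the whole attach.
	 */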
2436 	list_for_each_entry(d, &iommu->domain_list, next) {
2437 		if (d->domain->ops == domain->domain->ops &&
2438 		    d->prot == domain->prot) {
2439 			vfio_iommu_detach_group(domain, group);
2440 			if (!vfio_iommu_attach_group(d, group)) {
2441 				list_add(&group->next, &d->group_list);
2442 				iommu_domain_free(domain->domain);
2443 				kfree(domain);
2444 				goto done;
2445 			}
2446 
2447 			ret = vfio_iommu_attach_group(domain, group);
2448 			if (ret)
2449 				goto out_domain;
2450 		}
2451 	}
2452 
2453 	vfio_test_domain_fgsp(domain);
2454 
2455 	/* replay mappings on new domains */
2456 	ret = vfio_iommu_replay(iommu, domain);
2457 	if (ret)
2458 		goto out_detach;
2459 
2460 	if (resv_msi) {
2461 		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2462 		if (ret && ret != -ENODEV)
2463 			goto out_detach;
2464 	}
2465 
2466 	list_add(&domain->next, &iommu->domain_list);
2467 	vfio_update_pgsize_bitmap(iommu);
2468 done:
2469 	/* Delete the old iova list and insert the new one */
2470 	vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2471 
2472 	/*
2473 	 * An iommu backed group can dirty memory directly and therefore
2474 	 * demotes the iommu scope until it declares itself dirty tracking
2475 	 * capable via the page pinning interface.
2476 	 */
2477 	iommu->num_non_pinned_groups++;
2478 	mutex_unlock(&iommu->lock);
2479 	vfio_iommu_resv_free(&group_resv_regions);
2480 
2481 	return 0;
2482 
2483 out_detach:
2484 	vfio_iommu_detach_group(domain, group);
2485 out_domain:
2486 	iommu_domain_free(domain->domain);
2487 	vfio_iommu_iova_free(&iova_copy);
2488 	vfio_iommu_resv_free(&group_resv_regions);
2489 out_free:
2490 	kfree(domain);
2491 	kfree(group);
2492 	mutex_unlock(&iommu->lock);
2493 	return ret;
2494 }
2495 
2496 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2497 {
2498 	struct rb_node *node;
2499 
2500 	while ((node = rb_first(&iommu->dma_list)))
2501 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2502 }
2503 
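/*
 * Used when the last IOMMU-backed domain goes away while an external (mdev)
 * domain remains: IOMMU mappings are torn down, but pages still pinned via
 * the external pin interface (tracked in each dma's pfn_list) stay resident,
 * so the locked-memory accounting is corrected to cover only those pins.
 */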
2504 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2505 {
2506 	struct rb_node *n, *p;
2507 
2508 	n = rb_first(&iommu->dma_list);
2509 	for (; n; n = rb_next(n)) {
2510 		struct vfio_dma *dma;
2511 		long locked = 0, unlocked = 0;
2512 
2513 		dma = rb_entry(n, struct vfio_dma, node);
2514 		unlocked += vfio_unmap_unpin(iommu, dma, false);
2515 		p = rb_first(&dma->pfn_list);
2516 		for (; p; p = rb_next(p)) {
2517 			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2518 							 node);
2519 
2520 			if (!is_invalid_reserved_pfn(vpfn->pfn))
2521 				locked++;
2522 		}
2523 		vfio_lock_acct(dma, locked - unlocked, true);
2524 	}
2525 }
2526 
2527 /*
2528  * Called when a domain is removed in detach. It is possible that
2529  * the removed domain determined the iova aperture window. Modify the
2530  * iova aperture with the smallest window among existing domains.
2531  */
2532 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2533 				   struct list_head *iova_copy)
2534 {
2535 	struct vfio_domain *domain;
2536 	struct vfio_iova *node;
2537 	dma_addr_t start = 0;
2538 	dma_addr_t end = (dma_addr_t)~0;
2539 
2540 	if (list_empty(iova_copy))
2541 		return;
2542 
2543 	list_for_each_entry(domain, &iommu->domain_list, next) {
2544 		struct iommu_domain_geometry *geo = &domain->domain->geometry;
2545 
2546 		if (geo->aperture_start > start)
2547 			start = geo->aperture_start;
2548 		if (geo->aperture_end < end)
2549 			end = geo->aperture_end;
2550 	}
2551 
2552 	/* Modify aperture limits. The new aperture is either the same or larger */
2553 	node = list_first_entry(iova_copy, struct vfio_iova, list);
2554 	node->start = start;
2555 	node = list_last_entry(iova_copy, struct vfio_iova, list);
2556 	node->end = end;
2557 }
2558 
2559 /*
2560  * Called when a group is detached. The reserved regions for that
2561  * group can be part of valid iova now. But since reserved regions
2562  * may be duplicated among groups, populate the iova valid regions
2563  * list again.
2564  */
2565 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2566 				   struct list_head *iova_copy)
2567 {
2568 	struct vfio_domain *d;
2569 	struct vfio_iommu_group *g;
2570 	struct vfio_iova *node;
2571 	dma_addr_t start, end;
2572 	LIST_HEAD(resv_regions);
2573 	int ret;
2574 
2575 	if (list_empty(iova_copy))
2576 		return -EINVAL;
2577 
2578 	list_for_each_entry(d, &iommu->domain_list, next) {
2579 		list_for_each_entry(g, &d->group_list, next) {
2580 			ret = iommu_get_group_resv_regions(g->iommu_group,
2581 							   &resv_regions);
2582 			if (ret)
2583 				goto done;
2584 		}
2585 	}
2586 
2587 	node = list_first_entry(iova_copy, struct vfio_iova, list);
2588 	start = node->start;
2589 	node = list_last_entry(iova_copy, struct vfio_iova, list);
2590 	end = node->end;
2591 
2592 	/* Purge the iova list and create a new one */
2593 	vfio_iommu_iova_free(iova_copy);
2594 
2595 	ret = vfio_iommu_aper_resize(iova_copy, start, end);
2596 	if (ret)
2597 		goto done;
2598 
2599 	/* Exclude current reserved regions from iova ranges */
2600 	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2601 done:
2602 	vfio_iommu_resv_free(&resv_regions);
2603 	return ret;
2604 }
2605 
2606 static void vfio_iommu_type1_detach_group(void *iommu_data,
2607 					  struct iommu_group *iommu_group)
2608 {
2609 	struct vfio_iommu *iommu = iommu_data;
2610 	struct vfio_domain *domain;
2611 	struct vfio_iommu_group *group;
2612 	bool update_dirty_scope = false;
2613 	LIST_HEAD(iova_copy);
2614 
2615 	mutex_lock(&iommu->lock);
2616 
2617 	if (iommu->external_domain) {
2618 		group = find_iommu_group(iommu->external_domain, iommu_group);
2619 		if (group) {
2620 			update_dirty_scope = !group->pinned_page_dirty_scope;
2621 			list_del(&group->next);
2622 			kfree(group);
2623 
2624 			if (list_empty(&iommu->external_domain->group_list)) {
2625 				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
2626 					WARN_ON(iommu->notifier.head);
2627 					vfio_iommu_unmap_unpin_all(iommu);
2628 				}
2629 
2630 				kfree(iommu->external_domain);
2631 				iommu->external_domain = NULL;
2632 			}
2633 			goto detach_group_done;
2634 		}
2635 	}
2636 
2637 	/*
2638 	 * Get a copy of the iova list. It will be updated and will
2639 	 * replace the current one later. Note that the original list
2640 	 * is left untouched if the update fails.
2641 	 */
2642 	vfio_iommu_iova_get_copy(iommu, &iova_copy);
2643 
2644 	list_for_each_entry(domain, &iommu->domain_list, next) {
2645 		group = find_iommu_group(domain, iommu_group);
2646 		if (!group)
2647 			continue;
2648 
2649 		vfio_iommu_detach_group(domain, group);
2650 		update_dirty_scope = !group->pinned_page_dirty_scope;
2651 		list_del(&group->next);
2652 		kfree(group);
2653 		/*
2654 		 * Group ownership provides privilege; if the group list is
2655 		 * empty, the domain goes away.  If it's the last domain with
2656 		 * an iommu and no external domain exists, then all the
2657 		 * mappings go away too.  If it's the last domain with an
2658 		 * iommu and an external domain exists, update the accounting.
2659 		 */
2660 		if (list_empty(&domain->group_list)) {
2661 			if (list_is_singular(&iommu->domain_list)) {
2662 				if (!iommu->external_domain) {
2663 					WARN_ON(iommu->notifier.head);
2664 					vfio_iommu_unmap_unpin_all(iommu);
2665 				} else {
2666 					vfio_iommu_unmap_unpin_reaccount(iommu);
2667 				}
2668 			}
2669 			iommu_domain_free(domain->domain);
2670 			list_del(&domain->next);
2671 			kfree(domain);
2672 			vfio_iommu_aper_expand(iommu, &iova_copy);
2673 			vfio_update_pgsize_bitmap(iommu);
2674 		}
2675 		break;
2676 	}
2677 
2678 	if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2679 		vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2680 	else
2681 		vfio_iommu_iova_free(&iova_copy);
2682 
2683 detach_group_done:
2684 	/*
2685 	 * Removal of a group without dirty tracking may allow the iommu scope
2686 	 * to be promoted.
2687 	 */
2688 	if (update_dirty_scope) {
2689 		iommu->num_non_pinned_groups--;
2690 		if (iommu->dirty_page_tracking)
2691 			vfio_iommu_populate_bitmap_full(iommu);
2692 	}
2693 	mutex_unlock(&iommu->lock);
2694 }
2695 
2696 static void *vfio_iommu_type1_open(unsigned long arg)
2697 {
2698 	struct vfio_iommu *iommu;
2699 
2700 	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2701 	if (!iommu)
2702 		return ERR_PTR(-ENOMEM);
2703 
2704 	switch (arg) {
2705 	case VFIO_TYPE1_IOMMU:
2706 		break;
2707 	case VFIO_TYPE1_NESTING_IOMMU:
2708 		iommu->nesting = true;
2709 		fallthrough;
2710 	case VFIO_TYPE1v2_IOMMU:
2711 		iommu->v2 = true;
2712 		break;
2713 	default:
2714 		kfree(iommu);
2715 		return ERR_PTR(-EINVAL);
2716 	}
2717 
2718 	INIT_LIST_HEAD(&iommu->domain_list);
2719 	INIT_LIST_HEAD(&iommu->iova_list);
2720 	iommu->dma_list = RB_ROOT;
2721 	iommu->dma_avail = dma_entry_limit;
2722 	iommu->container_open = true;
2723 	mutex_init(&iommu->lock);
2724 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
2725 	init_waitqueue_head(&iommu->vaddr_wait);
2726 
2727 	return iommu;
2728 }
2729 
2730 static void vfio_release_domain(struct vfio_domain *domain, bool external)
2731 {
2732 	struct vfio_iommu_group *group, *group_tmp;
2733 
2734 	list_for_each_entry_safe(group, group_tmp,
2735 				 &domain->group_list, next) {
2736 		if (!external)
2737 			vfio_iommu_detach_group(domain, group);
2738 		list_del(&group->next);
2739 		kfree(group);
2740 	}
2741 
2742 	if (!external)
2743 		iommu_domain_free(domain->domain);
2744 }
2745 
2746 static void vfio_iommu_type1_release(void *iommu_data)
2747 {
2748 	struct vfio_iommu *iommu = iommu_data;
2749 	struct vfio_domain *domain, *domain_tmp;
2750 
2751 	if (iommu->external_domain) {
2752 		vfio_release_domain(iommu->external_domain, true);
2753 		kfree(iommu->external_domain);
2754 	}
2755 
2756 	vfio_iommu_unmap_unpin_all(iommu);
2757 
2758 	list_for_each_entry_safe(domain, domain_tmp,
2759 				 &iommu->domain_list, next) {
2760 		vfio_release_domain(domain, false);
2761 		list_del(&domain->next);
2762 		kfree(domain);
2763 	}
2764 
2765 	vfio_iommu_iova_free(&iommu->iova_list);
2766 
2767 	kfree(iommu);
2768 }
2769 
2770 static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
2771 {
2772 	struct vfio_domain *domain;
2773 	int ret = 1;
2774 
2775 	mutex_lock(&iommu->lock);
2776 	list_for_each_entry(domain, &iommu->domain_list, next) {
2777 		if (!(domain->prot & IOMMU_CACHE)) {
2778 			ret = 0;
2779 			break;
2780 		}
2781 	}
2782 	mutex_unlock(&iommu->lock);
2783 
2784 	return ret;
2785 }
2786 
2787 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2788 					    unsigned long arg)
2789 {
2790 	switch (arg) {
2791 	case VFIO_TYPE1_IOMMU:
2792 	case VFIO_TYPE1v2_IOMMU:
2793 	case VFIO_TYPE1_NESTING_IOMMU:
2794 	case VFIO_UNMAP_ALL:
2795 	case VFIO_UPDATE_VADDR:
2796 		return 1;
2797 	case VFIO_DMA_CC_IOMMU:
2798 		if (!iommu)
2799 			return 0;
2800 		return vfio_domains_have_iommu_cache(iommu);
2801 	default:
2802 		return 0;
2803 	}
2804 }
2805 
2806 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2807 		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2808 		 size_t size)
2809 {
2810 	struct vfio_info_cap_header *header;
2811 	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2812 
2813 	header = vfio_info_cap_add(caps, size,
2814 				   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2815 	if (IS_ERR(header))
2816 		return PTR_ERR(header);
2817 
2818 	iova_cap = container_of(header,
2819 				struct vfio_iommu_type1_info_cap_iova_range,
2820 				header);
2821 	iova_cap->nr_iovas = cap_iovas->nr_iovas;
2822 	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2823 	       cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2824 	return 0;
2825 }
2826 
2827 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2828 				      struct vfio_info_cap *caps)
2829 {
2830 	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2831 	struct vfio_iova *iova;
2832 	size_t size;
2833 	int iovas = 0, i = 0, ret;
2834 
2835 	list_for_each_entry(iova, &iommu->iova_list, list)
2836 		iovas++;
2837 
2838 	if (!iovas) {
2839 		/*
2840 		 * Return 0 as a container with a single mdev device
2841 		 * will have an empty list
2842 		 */
2843 		return 0;
2844 	}
2845 
2846 	size = struct_size(cap_iovas, iova_ranges, iovas);
2847 
2848 	cap_iovas = kzalloc(size, GFP_KERNEL);
2849 	if (!cap_iovas)
2850 		return -ENOMEM;
2851 
2852 	cap_iovas->nr_iovas = iovas;
2853 
2854 	list_for_each_entry(iova, &iommu->iova_list, list) {
2855 		cap_iovas->iova_ranges[i].start = iova->start;
2856 		cap_iovas->iova_ranges[i].end = iova->end;
2857 		i++;
2858 	}
2859 
2860 	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2861 
2862 	kfree(cap_iovas);
2863 	return ret;
2864 }
2865 
2866 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2867 					   struct vfio_info_cap *caps)
2868 {
2869 	struct vfio_iommu_type1_info_cap_migration cap_mig = {};
2870 
2871 	cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2872 	cap_mig.header.version = 1;
2873 
2874 	cap_mig.flags = 0;
2875 	/* support minimum pgsize */
2876 	cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2877 	cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2878 
2879 	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2880 }
2881 
2882 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2883 					   struct vfio_info_cap *caps)
2884 {
2885 	struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2886 
2887 	cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2888 	cap_dma_avail.header.version = 1;
2889 
2890 	cap_dma_avail.avail = iommu->dma_avail;
2891 
2892 	return vfio_info_add_capability(caps, &cap_dma_avail.header,
2893 					sizeof(cap_dma_avail));
2894 }
2895 
2896 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2897 				     unsigned long arg)
2898 {
2899 	struct vfio_iommu_type1_info info;
2900 	unsigned long minsz;
2901 	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2902 	unsigned long capsz;
2903 	int ret;
2904 
2905 	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2906 
2907 	/* For backward compatibility, cannot require this */
2908 	capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
2909 
2910 	if (copy_from_user(&info, (void __user *)arg, minsz))
2911 		return -EFAULT;
2912 
2913 	if (info.argsz < minsz)
2914 		return -EINVAL;
2915 
2916 	if (info.argsz >= capsz) {
2917 		minsz = capsz;
2918 		info.cap_offset = 0; /* output, no-recopy necessary */
2919 	}
2920 
2921 	mutex_lock(&iommu->lock);
2922 	info.flags = VFIO_IOMMU_INFO_PGSIZES;
2923 
2924 	info.iova_pgsizes = iommu->pgsize_bitmap;
2925 
2926 	ret = vfio_iommu_migration_build_caps(iommu, &caps);
2927 
2928 	if (!ret)
2929 		ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2930 
2931 	if (!ret)
2932 		ret = vfio_iommu_iova_build_caps(iommu, &caps);
2933 
2934 	mutex_unlock(&iommu->lock);
2935 
2936 	if (ret)
2937 		return ret;
2938 
2939 	if (caps.size) {
2940 		info.flags |= VFIO_IOMMU_INFO_CAPS;
2941 
2942 		if (info.argsz < sizeof(info) + caps.size) {
2943 			info.argsz = sizeof(info) + caps.size;
2944 		} else {
2945 			vfio_info_cap_shift(&caps, sizeof(info));
2946 			if (copy_to_user((void __user *)arg +
2947 					sizeof(info), caps.buf,
2948 					caps.size)) {
2949 				kfree(caps.buf);
2950 				return -EFAULT;
2951 			}
2952 			info.cap_offset = sizeof(info);
2953 		}
2954 
2955 		kfree(caps.buf);
2956 	}
2957 
2958 	return copy_to_user((void __user *)arg, &info, minsz) ?
2959 			-EFAULT : 0;
2960 }
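
/*
 * Illustrative userspace sketch (not part of this driver) of consuming the
 * capability chain assembled above.  It assumes "container" is an open,
 * VFIO_SET_IOMMU-configured container fd; allocation failures are ignored:
 *
 *	struct vfio_iommu_type1_info hdr = { .argsz = sizeof(hdr) };
 *	struct vfio_iommu_type1_info *info;
 *
 *	ioctl(container, VFIO_IOMMU_GET_INFO, &hdr);	// learn required argsz
 *	info = calloc(1, hdr.argsz);
 *	info->argsz = hdr.argsz;
 *	ioctl(container, VFIO_IOMMU_GET_INFO, info);	// info + capabilities
 *
 *	if (info->flags & VFIO_IOMMU_INFO_CAPS) {
 *		__u32 off = info->cap_offset;
 *
 *		while (off) {	// offsets are relative to &info[0]
 *			struct vfio_info_cap_header *cap =
 *				(struct vfio_info_cap_header *)((char *)info + off);
 *
 *			// cap->id is e.g. VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
 *			off = cap->next;
 *		}
 *	}
 */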
2961 
2962 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2963 				    unsigned long arg)
2964 {
2965 	struct vfio_iommu_type1_dma_map map;
2966 	unsigned long minsz;
2967 	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
2968 			VFIO_DMA_MAP_FLAG_VADDR;
2969 
2970 	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2971 
2972 	if (copy_from_user(&map, (void __user *)arg, minsz))
2973 		return -EFAULT;
2974 
2975 	if (map.argsz < minsz || map.flags & ~mask)
2976 		return -EINVAL;
2977 
2978 	return vfio_dma_do_map(iommu, &map);
2979 }
2980 
2981 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2982 				      unsigned long arg)
2983 {
2984 	struct vfio_iommu_type1_dma_unmap unmap;
2985 	struct vfio_bitmap bitmap = { 0 };
2986 	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
2987 			VFIO_DMA_UNMAP_FLAG_VADDR |
2988 			VFIO_DMA_UNMAP_FLAG_ALL;
2989 	unsigned long minsz;
2990 	int ret;
2991 
2992 	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2993 
2994 	if (copy_from_user(&unmap, (void __user *)arg, minsz))
2995 		return -EFAULT;
2996 
2997 	if (unmap.argsz < minsz || unmap.flags & ~mask)
2998 		return -EINVAL;
2999 
3000 	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
3001 	    (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
3002 			    VFIO_DMA_UNMAP_FLAG_VADDR)))
3003 		return -EINVAL;
3004 
3005 	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
3006 		unsigned long pgshift;
3007 
3008 		if (unmap.argsz < (minsz + sizeof(bitmap)))
3009 			return -EINVAL;
3010 
3011 		if (copy_from_user(&bitmap,
3012 				   (void __user *)(arg + minsz),
3013 				   sizeof(bitmap)))
3014 			return -EFAULT;
3015 
3016 		if (!access_ok((void __user *)bitmap.data, bitmap.size))
3017 			return -EINVAL;
3018 
3019 		pgshift = __ffs(bitmap.pgsize);
3020 		ret = verify_bitmap_size(unmap.size >> pgshift,
3021 					 bitmap.size);
3022 		if (ret)
3023 			return ret;
3024 	}
3025 
3026 	ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
3027 	if (ret)
3028 		return ret;
3029 
3030 	return copy_to_user((void __user *)arg, &unmap, minsz) ?
3031 			-EFAULT : 0;
3032 }
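
/*
 * Illustrative userspace sketch (not part of this driver) of driving the
 * two handlers above.  It assumes "container" is an open container fd that
 * has already been configured with VFIO_SET_IOMMU, and that "buf"/"len"
 * describe a page-aligned buffer; error handling is reduced to perror():
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0,
 *		.size  = len,
 *	};
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0,
 *		.size  = len,
 *	};
 *
 *	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
 *		perror("VFIO_IOMMU_MAP_DMA");
 *	... device performs DMA to IOVA 0 ...
 *	if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap))
 *		perror("VFIO_IOMMU_UNMAP_DMA");
 */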
3033 
3034 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
3035 					unsigned long arg)
3036 {
3037 	struct vfio_iommu_type1_dirty_bitmap dirty;
3038 	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
3039 			VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
3040 			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
3041 	unsigned long minsz;
3042 	int ret = 0;
3043 
3044 	if (!iommu->v2)
3045 		return -EACCES;
3046 
3047 	minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
3048 
3049 	if (copy_from_user(&dirty, (void __user *)arg, minsz))
3050 		return -EFAULT;
3051 
3052 	if (dirty.argsz < minsz || dirty.flags & ~mask)
3053 		return -EINVAL;
3054 
3055 	/* only one flag should be set at a time */
3056 	if (__ffs(dirty.flags) != __fls(dirty.flags))
3057 		return -EINVAL;
3058 
3059 	if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
3060 		size_t pgsize;
3061 
3062 		mutex_lock(&iommu->lock);
3063 		pgsize = 1 << __ffs(iommu->pgsize_bitmap);
3064 		if (!iommu->dirty_page_tracking) {
3065 			ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
3066 			if (!ret)
3067 				iommu->dirty_page_tracking = true;
3068 		}
3069 		mutex_unlock(&iommu->lock);
3070 		return ret;
3071 	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
3072 		mutex_lock(&iommu->lock);
3073 		if (iommu->dirty_page_tracking) {
3074 			iommu->dirty_page_tracking = false;
3075 			vfio_dma_bitmap_free_all(iommu);
3076 		}
3077 		mutex_unlock(&iommu->lock);
3078 		return 0;
3079 	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
3080 		struct vfio_iommu_type1_dirty_bitmap_get range;
3081 		unsigned long pgshift;
3082 		size_t data_size = dirty.argsz - minsz;
3083 		size_t iommu_pgsize;
3084 
3085 		if (!data_size || data_size < sizeof(range))
3086 			return -EINVAL;
3087 
3088 		if (copy_from_user(&range, (void __user *)(arg + minsz),
3089 				   sizeof(range)))
3090 			return -EFAULT;
3091 
3092 		if (range.iova + range.size < range.iova)
3093 			return -EINVAL;
3094 		if (!access_ok((void __user *)range.bitmap.data,
3095 			       range.bitmap.size))
3096 			return -EINVAL;
3097 
3098 		pgshift = __ffs(range.bitmap.pgsize);
3099 		ret = verify_bitmap_size(range.size >> pgshift,
3100 					 range.bitmap.size);
3101 		if (ret)
3102 			return ret;
3103 
3104 		mutex_lock(&iommu->lock);
3105 
3106 		iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
3107 
3108 		/* allow only smallest supported pgsize */
3109 		if (range.bitmap.pgsize != iommu_pgsize) {
3110 			ret = -EINVAL;
3111 			goto out_unlock;
3112 		}
3113 		if (range.iova & (iommu_pgsize - 1)) {
3114 			ret = -EINVAL;
3115 			goto out_unlock;
3116 		}
3117 		if (!range.size || range.size & (iommu_pgsize - 1)) {
3118 			ret = -EINVAL;
3119 			goto out_unlock;
3120 		}
3121 
3122 		if (iommu->dirty_page_tracking)
3123 			ret = vfio_iova_dirty_bitmap(range.bitmap.data,
3124 						     iommu, range.iova,
3125 						     range.size,
3126 						     range.bitmap.pgsize);
3127 		else
3128 			ret = -EINVAL;
3129 out_unlock:
3130 		mutex_unlock(&iommu->lock);
3131 
3132 		return ret;
3133 	}
3134 
3135 	return -EINVAL;
3136 }
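
/*
 * Dirty page tracking, as implemented above, is driven with exactly one
 * flag per VFIO_IOMMU_DIRTY_PAGES call: START allocates a per-mapping
 * bitmap sized for the smallest supported page size, GET_BITMAP only
 * accepts that page size and requires the queried iova/size to be aligned
 * to it, and STOP frees all bitmaps and disables tracking.
 */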
3137 
3138 static long vfio_iommu_type1_ioctl(void *iommu_data,
3139 				   unsigned int cmd, unsigned long arg)
3140 {
3141 	struct vfio_iommu *iommu = iommu_data;
3142 
3143 	switch (cmd) {
3144 	case VFIO_CHECK_EXTENSION:
3145 		return vfio_iommu_type1_check_extension(iommu, arg);
3146 	case VFIO_IOMMU_GET_INFO:
3147 		return vfio_iommu_type1_get_info(iommu, arg);
3148 	case VFIO_IOMMU_MAP_DMA:
3149 		return vfio_iommu_type1_map_dma(iommu, arg);
3150 	case VFIO_IOMMU_UNMAP_DMA:
3151 		return vfio_iommu_type1_unmap_dma(iommu, arg);
3152 	case VFIO_IOMMU_DIRTY_PAGES:
3153 		return vfio_iommu_type1_dirty_pages(iommu, arg);
3154 	default:
3155 		return -ENOTTY;
3156 	}
3157 }
3158 
3159 static int vfio_iommu_type1_register_notifier(void *iommu_data,
3160 					      unsigned long *events,
3161 					      struct notifier_block *nb)
3162 {
3163 	struct vfio_iommu *iommu = iommu_data;
3164 
3165 	/* clear known events */
3166 	*events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
3167 
3168 	/* refuse to register if any unknown events remain */
3169 	if (*events)
3170 		return -EINVAL;
3171 
3172 	return blocking_notifier_chain_register(&iommu->notifier, nb);
3173 }
3174 
3175 static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
3176 						struct notifier_block *nb)
3177 {
3178 	struct vfio_iommu *iommu = iommu_data;
3179 
3180 	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
3181 }
3182 
3183 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
3184 					 dma_addr_t user_iova, void *data,
3185 					 size_t count, bool write,
3186 					 size_t *copied)
3187 {
3188 	struct mm_struct *mm;
3189 	unsigned long vaddr;
3190 	struct vfio_dma *dma;
3191 	bool kthread = current->mm == NULL;
3192 	size_t offset;
3193 	int ret;
3194 
3195 	*copied = 0;
3196 
3197 	ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma);
3198 	if (ret < 0)
3199 		return ret;
3200 
3201 	if ((write && !(dma->prot & IOMMU_WRITE)) ||
3202 			!(dma->prot & IOMMU_READ))
3203 		return -EPERM;
3204 
3205 	mm = dma->mm;
3206 	if (!mmget_not_zero(mm))
3207 		return -EPERM;
3208 
3209 	if (kthread)
3210 		kthread_use_mm(mm);
3211 	else if (current->mm != mm)
3212 		goto out;
3213 
3214 	offset = user_iova - dma->iova;
3215 
3216 	if (count > dma->size - offset)
3217 		count = dma->size - offset;
3218 
3219 	vaddr = dma->vaddr + offset;
3220 
3221 	if (write) {
3222 		*copied = copy_to_user((void __user *)vaddr, data,
3223 					 count) ? 0 : count;
3224 		if (*copied && iommu->dirty_page_tracking) {
3225 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3226 			/*
3227 			 * Bitmap populated with the smallest supported page
3228 			 * size
3229 			 */
3230 			bitmap_set(dma->bitmap, offset >> pgshift,
3231 				   ((offset + *copied - 1) >> pgshift) -
3232 				   (offset >> pgshift) + 1);
3233 		}
3234 	} else
3235 		*copied = copy_from_user(data, (void __user *)vaddr,
3236 					   count) ? 0 : count;
3237 	if (kthread)
3238 		kthread_unuse_mm(mm);
3239 out:
3240 	mmput(mm);
3241 	return *copied ? 0 : -EFAULT;
3242 }
3243 
3244 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3245 				   void *data, size_t count, bool write)
3246 {
3247 	struct vfio_iommu *iommu = iommu_data;
3248 	int ret = 0;
3249 	size_t done;
3250 
3251 	mutex_lock(&iommu->lock);
3252 	while (count > 0) {
3253 		ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3254 						    count, write, &done);
3255 		if (ret)
3256 			break;
3257 
3258 		count -= done;
3259 		data += done;
3260 		user_iova += done;
3261 	}
3262 
3263 	mutex_unlock(&iommu->lock);
3264 	return ret;
3265 }
3266 
3267 static struct iommu_domain *
3268 vfio_iommu_type1_group_iommu_domain(void *iommu_data,
3269 				    struct iommu_group *iommu_group)
3270 {
3271 	struct iommu_domain *domain = ERR_PTR(-ENODEV);
3272 	struct vfio_iommu *iommu = iommu_data;
3273 	struct vfio_domain *d;
3274 
3275 	if (!iommu || !iommu_group)
3276 		return ERR_PTR(-EINVAL);
3277 
3278 	mutex_lock(&iommu->lock);
3279 	list_for_each_entry(d, &iommu->domain_list, next) {
3280 		if (find_iommu_group(d, iommu_group)) {
3281 			domain = d->domain;
3282 			break;
3283 		}
3284 	}
3285 	mutex_unlock(&iommu->lock);
3286 
3287 	return domain;
3288 }
3289 
3290 static void vfio_iommu_type1_notify(void *iommu_data,
3291 				    enum vfio_iommu_notify_type event)
3292 {
3293 	struct vfio_iommu *iommu = iommu_data;
3294 
3295 	if (event != VFIO_IOMMU_CONTAINER_CLOSE)
3296 		return;
3297 	mutex_lock(&iommu->lock);
3298 	iommu->container_open = false;
3299 	mutex_unlock(&iommu->lock);
3300 	wake_up_all(&iommu->vaddr_wait);
3301 }
3302 
3303 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3304 	.name			= "vfio-iommu-type1",
3305 	.owner			= THIS_MODULE,
3306 	.open			= vfio_iommu_type1_open,
3307 	.release		= vfio_iommu_type1_release,
3308 	.ioctl			= vfio_iommu_type1_ioctl,
3309 	.attach_group		= vfio_iommu_type1_attach_group,
3310 	.detach_group		= vfio_iommu_type1_detach_group,
3311 	.pin_pages		= vfio_iommu_type1_pin_pages,
3312 	.unpin_pages		= vfio_iommu_type1_unpin_pages,
3313 	.register_notifier	= vfio_iommu_type1_register_notifier,
3314 	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
3315 	.dma_rw			= vfio_iommu_type1_dma_rw,
3316 	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
3317 	.notify			= vfio_iommu_type1_notify,
3318 };
3319 
3320 static int __init vfio_iommu_type1_init(void)
3321 {
3322 	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3323 }
3324 
3325 static void __exit vfio_iommu_type1_cleanup(void)
3326 {
3327 	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3328 }
3329 
3330 module_init(vfio_iommu_type1_init);
3331 module_exit(vfio_iommu_type1_cleanup);
3332 
3333 MODULE_VERSION(DRIVER_VERSION);
3334 MODULE_LICENSE("GPL v2");
3335 MODULE_AUTHOR(DRIVER_AUTHOR);
3336 MODULE_DESCRIPTION(DRIVER_DESC);
3337