1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
4 *
5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6 * Author: Alex Williamson <alex.williamson@redhat.com>
7 *
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
11 *
12 * We arbitrarily define a Type1 IOMMU as one matching the below code.
13 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
14 * VT-d, but that makes it harder to re-use as theoretically anyone
15 * implementing a similar IOMMU could make use of this. We expect the
16 * IOMMU to support the IOMMU API and have few to no restrictions around
17 * the IOVA range that can be mapped. The Type1 IOMMU is currently
18 * optimized for relatively static mappings of a userspace process with
19 * userspace pages pinned into memory. We also assume devices and IOMMU
20 * domains are PCI based as the IOMMU API is still centered around a
21 * device/bus interface rather than a group interface.
22 */
23
24 #include <linux/compat.h>
25 #include <linux/device.h>
26 #include <linux/fs.h>
27 #include <linux/highmem.h>
28 #include <linux/iommu.h>
29 #include <linux/module.h>
30 #include <linux/mm.h>
31 #include <linux/kthread.h>
32 #include <linux/rbtree.h>
33 #include <linux/sched/signal.h>
34 #include <linux/sched/mm.h>
35 #include <linux/slab.h>
36 #include <linux/uaccess.h>
37 #include <linux/vfio.h>
38 #include <linux/workqueue.h>
39 #include <linux/mdev.h>
40 #include <linux/notifier.h>
41 #include <linux/dma-iommu.h>
42 #include <linux/irqdomain.h>
43
44 #define DRIVER_VERSION "0.2"
45 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
46 #define DRIVER_DESC "Type1 IOMMU driver for VFIO"
47
48 static bool allow_unsafe_interrupts;
49 module_param_named(allow_unsafe_interrupts,
50 allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
51 MODULE_PARM_DESC(allow_unsafe_interrupts,
52 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
53
54 static bool disable_hugepages;
55 module_param_named(disable_hugepages,
56 disable_hugepages, bool, S_IRUGO | S_IWUSR);
57 MODULE_PARM_DESC(disable_hugepages,
58 "Disable VFIO IOMMU support for IOMMU hugepages.");
59
60 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
61 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
62 MODULE_PARM_DESC(dma_entry_limit,
63 "Maximum number of user DMA mappings per container (65535).");
64
65 struct vfio_iommu {
66 struct list_head domain_list;
67 struct list_head iova_list;
68 struct vfio_domain *external_domain; /* domain for external user */
69 struct mutex lock;
70 struct rb_root dma_list;
71 struct blocking_notifier_head notifier;
72 unsigned int dma_avail;
73 unsigned int vaddr_invalid_count;
74 uint64_t pgsize_bitmap;
75 uint64_t num_non_pinned_groups;
76 wait_queue_head_t vaddr_wait;
77 bool v2;
78 bool nesting;
79 bool dirty_page_tracking;
80 bool container_open;
81 };
82
83 struct vfio_domain {
84 struct iommu_domain *domain;
85 struct list_head next;
86 struct list_head group_list;
87 int prot; /* IOMMU_CACHE */
88 bool fgsp; /* Fine-grained super pages */
89 };
90
91 struct vfio_dma {
92 struct rb_node node;
93 dma_addr_t iova; /* Device address */
94 unsigned long vaddr; /* Process virtual addr */
95 size_t size; /* Map size (bytes) */
96 int prot; /* IOMMU_READ/WRITE */
97 bool iommu_mapped;
98 bool lock_cap; /* capable(CAP_IPC_LOCK) */
99 bool vaddr_invalid;
100 struct task_struct *task;
101 struct rb_root pfn_list; /* Ex-user pinned pfn list */
102 unsigned long *bitmap;
103 struct mm_struct *mm;
104 size_t locked_vm;
105 };
106
107 struct vfio_batch {
108 struct page **pages; /* for pin_user_pages_remote */
109 struct page *fallback_page; /* if pages alloc fails */
110 int capacity; /* length of pages array */
111 int size; /* of batch currently */
112 int offset; /* of next entry in pages */
113 };
114
115 struct vfio_iommu_group {
116 struct iommu_group *iommu_group;
117 struct list_head next;
118 bool mdev_group; /* An mdev group */
119 bool pinned_page_dirty_scope;
120 };
121
122 struct vfio_iova {
123 struct list_head list;
124 dma_addr_t start;
125 dma_addr_t end;
126 };
127
128 /*
129 * Guest RAM pinning working set or DMA target
130 */
131 struct vfio_pfn {
132 struct rb_node node;
133 dma_addr_t iova; /* Device address */
134 unsigned long pfn; /* Host pfn */
135 unsigned int ref_count;
136 };
137
138 struct vfio_regions {
139 struct list_head list;
140 dma_addr_t iova;
141 phys_addr_t phys;
142 size_t len;
143 };
144
145 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
146 (!list_empty(&iommu->domain_list))
147
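/*
 * For example, DIRTY_BITMAP_BYTES(1) = 8 and DIRTY_BITMAP_BYTES(65) = 16:
 * the page count is rounded up to a whole number of 64-bit words and then
 * converted to bytes, so the bitmap is always sized in full u64 words.
 */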
148 #define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
149
150 /*
151 * The number-of-bits argument to bitmap_set() is an unsigned integer,
152 * which is further cast to a signed integer by the unaligned multi-bit
153 * helper, __bitmap_set().
154 * The maximum supported bitmap size is therefore 2^31 bits, divided by
155 * 2^3 bits/byte that is 2^28 bytes (256 MB), which maps to 2^31 * 2^12 =
156 * 2^43 (8 TB) of memory on a 4K page system.
157 */
158 #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX)
159 #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
160
161 #define WAITED 1
162
163 static int put_pfn(unsigned long pfn, int prot);
164
165 static struct vfio_iommu_group*
166 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
167 struct iommu_group *iommu_group);
168
169 /*
170 * This code handles mapping and unmapping of user data buffers
171 * into DMA'ble space using the IOMMU
172 */
173
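/*
 * Return the vfio_dma whose [iova, iova + size) range overlaps
 * [start, start + size), or NULL if no mapping intersects the range.
 */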
174 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
175 dma_addr_t start, size_t size)
176 {
177 struct rb_node *node = iommu->dma_list.rb_node;
178
179 while (node) {
180 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
181
182 if (start + size <= dma->iova)
183 node = node->rb_left;
184 else if (start >= dma->iova + dma->size)
185 node = node->rb_right;
186 else
187 return dma;
188 }
189
190 return NULL;
191 }
192
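/*
 * Return the node of the lowest-iova vfio_dma intersecting
 * [start, start + size), or NULL; used as the starting point for walking
 * the dma_list in iova order.
 */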
193 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
194 dma_addr_t start, u64 size)
195 {
196 struct rb_node *res = NULL;
197 struct rb_node *node = iommu->dma_list.rb_node;
198 struct vfio_dma *dma_res = NULL;
199
200 while (node) {
201 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
202
203 if (start < dma->iova + dma->size) {
204 res = node;
205 dma_res = dma;
206 if (start >= dma->iova)
207 break;
208 node = node->rb_left;
209 } else {
210 node = node->rb_right;
211 }
212 }
213 if (res && size && dma_res->iova >= start + size)
214 res = NULL;
215 return res;
216 }
217
218 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
219 {
220 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
221 struct vfio_dma *dma;
222
223 while (*link) {
224 parent = *link;
225 dma = rb_entry(parent, struct vfio_dma, node);
226
227 if (new->iova + new->size <= dma->iova)
228 link = &(*link)->rb_left;
229 else
230 link = &(*link)->rb_right;
231 }
232
233 rb_link_node(&new->node, parent, link);
234 rb_insert_color(&new->node, &iommu->dma_list);
235 }
236
237 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
238 {
239 rb_erase(&old->node, &iommu->dma_list);
240 }
241
242
243 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
244 {
245 uint64_t npages = dma->size / pgsize;
246
247 if (npages > DIRTY_BITMAP_PAGES_MAX)
248 return -EINVAL;
249
250 /*
251 * Allocate an extra 64 bits so that bitmap_shift_left() has room to
252 * shift and combine an unaligned number of pages with the bitmap of an
253 * adjacent vfio_dma range.
254 */
255 dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
256 GFP_KERNEL);
257 if (!dma->bitmap)
258 return -ENOMEM;
259
260 return 0;
261 }
262
263 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
264 {
265 kvfree(dma->bitmap);
266 dma->bitmap = NULL;
267 }
268
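/*
 * Seed the dirty bitmap with every page that currently has an externally
 * pinned pfn, since pinned pages are always reported as dirty.
 */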
269 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
270 {
271 struct rb_node *p;
272 unsigned long pgshift = __ffs(pgsize);
273
274 for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
275 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
276
277 bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
278 }
279 }
280
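/*
 * Conservatively mark every page of every mapping in the container dirty.
 */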
281 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
282 {
283 struct rb_node *n;
284 unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
285
286 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
287 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
288
289 bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
290 }
291 }
292
293 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
294 {
295 struct rb_node *n;
296
297 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
298 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
299 int ret;
300
301 ret = vfio_dma_bitmap_alloc(dma, pgsize);
302 if (ret) {
303 struct rb_node *p;
304
305 for (p = rb_prev(n); p; p = rb_prev(p)) {
306 struct vfio_dma *dma = rb_entry(p,
307 struct vfio_dma, node);
308
309 vfio_dma_bitmap_free(dma);
310 }
311 return ret;
312 }
313 vfio_dma_populate_bitmap(dma, pgsize);
314 }
315 return 0;
316 }
317
318 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
319 {
320 struct rb_node *n;
321
322 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
323 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
324
325 vfio_dma_bitmap_free(dma);
326 }
327 }
328
329 /*
330 * Helper Functions for host iova-pfn list
331 */
332 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
333 {
334 struct vfio_pfn *vpfn;
335 struct rb_node *node = dma->pfn_list.rb_node;
336
337 while (node) {
338 vpfn = rb_entry(node, struct vfio_pfn, node);
339
340 if (iova < vpfn->iova)
341 node = node->rb_left;
342 else if (iova > vpfn->iova)
343 node = node->rb_right;
344 else
345 return vpfn;
346 }
347 return NULL;
348 }
349
350 static void vfio_link_pfn(struct vfio_dma *dma,
351 struct vfio_pfn *new)
352 {
353 struct rb_node **link, *parent = NULL;
354 struct vfio_pfn *vpfn;
355
356 link = &dma->pfn_list.rb_node;
357 while (*link) {
358 parent = *link;
359 vpfn = rb_entry(parent, struct vfio_pfn, node);
360
361 if (new->iova < vpfn->iova)
362 link = &(*link)->rb_left;
363 else
364 link = &(*link)->rb_right;
365 }
366
367 rb_link_node(&new->node, parent, link);
368 rb_insert_color(&new->node, &dma->pfn_list);
369 }
370
371 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
372 {
373 rb_erase(&old->node, &dma->pfn_list);
374 }
375
376 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
377 unsigned long pfn)
378 {
379 struct vfio_pfn *vpfn;
380
381 vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
382 if (!vpfn)
383 return -ENOMEM;
384
385 vpfn->iova = iova;
386 vpfn->pfn = pfn;
387 vpfn->ref_count = 1;
388 vfio_link_pfn(dma, vpfn);
389 return 0;
390 }
391
392 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
393 struct vfio_pfn *vpfn)
394 {
395 vfio_unlink_pfn(dma, vpfn);
396 kfree(vpfn);
397 }
398
399 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
400 unsigned long iova)
401 {
402 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
403
404 if (vpfn)
405 vpfn->ref_count++;
406 return vpfn;
407 }
408
409 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
410 {
411 int ret = 0;
412
413 vpfn->ref_count--;
414 if (!vpfn->ref_count) {
415 ret = put_pfn(vpfn->pfn, dma->prot);
416 vfio_remove_from_pfn_list(dma, vpfn);
417 }
418 return ret;
419 }
420
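/*
 * Charge (npage > 0) or uncharge (npage < 0) pages against the mm's
 * locked_vm under the mmap write lock, enforcing RLIMIT_MEMLOCK unless
 * the owner holds CAP_IPC_LOCK.
 */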
421 static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
422 bool lock_cap, long npage)
423 {
424 int ret = mmap_write_lock_killable(mm);
425
426 if (ret)
427 return ret;
428
429 ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
430 mmap_write_unlock(mm);
431 return ret;
432 }
433
434 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
435 {
436 struct mm_struct *mm;
437 int ret;
438
439 if (!npage)
440 return 0;
441
442 mm = dma->mm;
443 if (async && !mmget_not_zero(mm))
444 return -ESRCH; /* process exited */
445
446 ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
447 if (!ret)
448 dma->locked_vm += npage;
449
450 if (async)
451 mmput(mm);
452
453 return ret;
454 }
455
456 /*
457 * Some mappings aren't backed by a struct page, for example an mmap'd
458 * MMIO range for our own or another device. These use a different
459 * pfn conversion and shouldn't be tracked as locked pages.
460 * For compound pages, any driver that sets the reserved bit in head
461 * page needs to set the reserved bit in all subpages to be safe.
462 */
463 static bool is_invalid_reserved_pfn(unsigned long pfn)
464 {
465 if (pfn_valid(pfn))
466 return PageReserved(pfn_to_page(pfn));
467
468 return true;
469 }
470
471 static int put_pfn(unsigned long pfn, int prot)
472 {
473 if (!is_invalid_reserved_pfn(pfn)) {
474 struct page *page = pfn_to_page(pfn);
475
476 unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
477 return 1;
478 }
479 return 0;
480 }
481
482 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
483
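/*
 * A batch normally holds one page worth of struct page pointers, i.e.
 * PAGE_SIZE / sizeof(struct page *) entries (512 on a 4K-page, 64-bit
 * system). If that allocation fails, or hugepages are disabled, fall back
 * to a single-entry batch backed by fallback_page.
 */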
484 static void vfio_batch_init(struct vfio_batch *batch)
485 {
486 batch->size = 0;
487 batch->offset = 0;
488
489 if (unlikely(disable_hugepages))
490 goto fallback;
491
492 batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
493 if (!batch->pages)
494 goto fallback;
495
496 batch->capacity = VFIO_BATCH_MAX_CAPACITY;
497 return;
498
499 fallback:
500 batch->pages = &batch->fallback_page;
501 batch->capacity = 1;
502 }
503
504 static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
505 {
506 while (batch->size) {
507 unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
508
509 put_pfn(pfn, dma->prot);
510 batch->offset++;
511 batch->size--;
512 }
513 }
514
515 static void vfio_batch_fini(struct vfio_batch *batch)
516 {
517 if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
518 free_page((unsigned long)batch->pages);
519 }
520
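/*
 * Resolve the pfn backing a VM_PFNMAP vaddr. If no PTE is present yet,
 * fault the mapping in with fixup_user_fault() and look the PTE up again;
 * -EAGAIN tells the caller that the mmap lock was dropped and the vma
 * must be revalidated.
 */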
521 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
522 unsigned long vaddr, unsigned long *pfn,
523 bool write_fault)
524 {
525 pte_t *ptep;
526 spinlock_t *ptl;
527 int ret;
528
529 ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
530 if (ret) {
531 bool unlocked = false;
532
533 ret = fixup_user_fault(mm, vaddr,
534 FAULT_FLAG_REMOTE |
535 (write_fault ? FAULT_FLAG_WRITE : 0),
536 &unlocked);
537 if (unlocked)
538 return -EAGAIN;
539
540 if (ret)
541 return ret;
542
543 ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
544 if (ret)
545 return ret;
546 }
547
548 if (write_fault && !pte_write(*ptep))
549 ret = -EFAULT;
550 else
551 *pfn = pte_pfn(*ptep);
552
553 pte_unmap_unlock(ptep, ptl);
554 return ret;
555 }
556
557 /*
558 * Returns the positive number of pfns successfully obtained or a negative
559 * error code.
560 */
561 static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
562 long npages, int prot, unsigned long *pfn,
563 struct page **pages)
564 {
565 struct vm_area_struct *vma;
566 unsigned int flags = 0;
567 int ret;
568
569 if (prot & IOMMU_WRITE)
570 flags |= FOLL_WRITE;
571
572 mmap_read_lock(mm);
573 ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
574 pages, NULL, NULL);
575 if (ret > 0) {
576 int i;
577
578 /*
579 * The zero page is always resident, we don't need to pin it
580 * and it falls into our invalid/reserved test so we don't
581 * unpin in put_pfn(). Unpin all zero pages in the batch here.
582 */
583 for (i = 0 ; i < ret; i++) {
584 if (unlikely(is_zero_pfn(page_to_pfn(pages[i]))))
585 unpin_user_page(pages[i]);
586 }
587
588 *pfn = page_to_pfn(pages[0]);
589 goto done;
590 }
591
592 vaddr = untagged_addr(vaddr);
593
594 retry:
595 vma = vma_lookup(mm, vaddr);
596
597 if (vma && vma->vm_flags & VM_PFNMAP) {
598 ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
599 if (ret == -EAGAIN)
600 goto retry;
601
602 if (!ret) {
603 if (is_invalid_reserved_pfn(*pfn))
604 ret = 1;
605 else
606 ret = -EFAULT;
607 }
608 }
609 done:
610 mmap_read_unlock(mm);
611 return ret;
612 }
613
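/*
 * Sleep until a vaddr update (VFIO_DMA_MAP_FLAG_VADDR) wakes us, dropping
 * the iommu lock while waiting. Returns WAITED, or -EFAULT if the wait is
 * cut short by container close, kthread stop, or a fatal signal.
 */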
614 static int vfio_wait(struct vfio_iommu *iommu)
615 {
616 DEFINE_WAIT(wait);
617
618 prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
619 mutex_unlock(&iommu->lock);
620 schedule();
621 mutex_lock(&iommu->lock);
622 finish_wait(&iommu->vaddr_wait, &wait);
623 if (kthread_should_stop() || !iommu->container_open ||
624 fatal_signal_pending(current)) {
625 return -EFAULT;
626 }
627 return WAITED;
628 }
629
630 /*
631 * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped
632 * if the task waits, but is re-locked on return. Return result in *dma_p.
633 * Return 0 on success with no waiting, WAITED on success if waited, and -errno
634 * on error.
635 */
636 static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start,
637 size_t size, struct vfio_dma **dma_p)
638 {
639 int ret = 0;
640
641 do {
642 *dma_p = vfio_find_dma(iommu, start, size);
643 if (!*dma_p)
644 return -EINVAL;
645 else if (!(*dma_p)->vaddr_invalid)
646 return ret;
647 else
648 ret = vfio_wait(iommu);
649 } while (ret == WAITED);
650
651 return ret;
652 }
653
654 /*
655 * Wait for all vaddr in the dma_list to become valid. iommu lock is dropped
656 * if the task waits, but is re-locked on return. Return 0 on success with no
657 * waiting, WAITED on success if waited, and -errno on error.
658 */
659 static int vfio_wait_all_valid(struct vfio_iommu *iommu)
660 {
661 int ret = 0;
662
663 while (iommu->vaddr_invalid_count && ret >= 0)
664 ret = vfio_wait(iommu);
665
666 return ret;
667 }
668
669 /*
670 * Attempt to pin pages. We really don't want to track all the pfns and
671 * the iommu can only map chunks of consecutive pfns anyway, so get the
672 * first page and all consecutive pages with the same locking.
673 */
674 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
675 long npage, unsigned long *pfn_base,
676 unsigned long limit, struct vfio_batch *batch)
677 {
678 unsigned long pfn;
679 struct mm_struct *mm = current->mm;
680 long ret, pinned = 0, lock_acct = 0;
681 bool rsvd;
682 dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
683
684 /* This code path is only user initiated */
685 if (!mm)
686 return -ENODEV;
687
688 if (batch->size) {
689 /* Leftover pages in batch from an earlier call. */
690 *pfn_base = page_to_pfn(batch->pages[batch->offset]);
691 pfn = *pfn_base;
692 rsvd = is_invalid_reserved_pfn(*pfn_base);
693 } else {
694 *pfn_base = 0;
695 }
696
697 while (npage) {
698 if (!batch->size) {
699 /* Empty batch, so refill it. */
700 long req_pages = min_t(long, npage, batch->capacity);
701
702 ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
703 &pfn, batch->pages);
704 if (ret < 0)
705 goto unpin_out;
706
707 batch->size = ret;
708 batch->offset = 0;
709
710 if (!*pfn_base) {
711 *pfn_base = pfn;
712 rsvd = is_invalid_reserved_pfn(*pfn_base);
713 }
714 }
715
716 /*
717 * pfn is preset for the first iteration of this inner loop and
718 * updated at the end to handle a VM_PFNMAP pfn. In that case,
719 * batch->pages isn't valid (there's no struct page), so allow
720 * batch->pages to be touched only when there's more than one
721 * pfn to check, which guarantees the pfns are from a
722 * !VM_PFNMAP vma.
723 */
724 while (true) {
725 if (pfn != *pfn_base + pinned ||
726 rsvd != is_invalid_reserved_pfn(pfn))
727 goto out;
728
729 /*
730 * Reserved pages aren't counted against the user,
731 * externally pinned pages are already counted against
732 * the user.
733 */
734 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
735 if (!dma->lock_cap &&
736 mm->locked_vm + lock_acct + 1 > limit) {
737 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
738 __func__, limit << PAGE_SHIFT);
739 ret = -ENOMEM;
740 goto unpin_out;
741 }
742 lock_acct++;
743 }
744
745 pinned++;
746 npage--;
747 vaddr += PAGE_SIZE;
748 iova += PAGE_SIZE;
749 batch->offset++;
750 batch->size--;
751
752 if (!batch->size)
753 break;
754
755 pfn = page_to_pfn(batch->pages[batch->offset]);
756 }
757
758 if (unlikely(disable_hugepages))
759 break;
760 }
761
762 out:
763 ret = vfio_lock_acct(dma, lock_acct, false);
764
765 unpin_out:
766 if (batch->size == 1 && !batch->offset) {
767 /* May be a VM_PFNMAP pfn, which the batch can't remember. */
768 put_pfn(pfn, dma->prot);
769 batch->size = 0;
770 }
771
772 if (ret < 0) {
773 if (pinned && !rsvd) {
774 for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
775 put_pfn(pfn, dma->prot);
776 }
777 vfio_batch_unpin(batch, dma);
778
779 return ret;
780 }
781
782 return pinned;
783 }
784
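/*
 * Unpin npage host pages starting at pfn. Pages that still carry an
 * external vpfn reference remain accounted as locked, so only the
 * difference is returned to the task's locked_vm.
 */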
785 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
786 unsigned long pfn, long npage,
787 bool do_accounting)
788 {
789 long unlocked = 0, locked = 0;
790 long i;
791
792 for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
793 if (put_pfn(pfn++, dma->prot)) {
794 unlocked++;
795 if (vfio_find_vpfn(dma, iova))
796 locked++;
797 }
798 }
799
800 if (do_accounting)
801 vfio_lock_acct(dma, locked - unlocked, true);
802
803 return unlocked;
804 }
805
806 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
807 unsigned long *pfn_base, bool do_accounting)
808 {
809 struct page *pages[1];
810 struct mm_struct *mm;
811 int ret;
812
813 mm = dma->mm;
814 if (!mmget_not_zero(mm))
815 return -ENODEV;
816
817 ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
818 if (ret != 1)
819 goto out;
820
821 ret = 0;
822
823 if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
824 ret = vfio_lock_acct(dma, 1, false);
825 if (ret) {
826 put_pfn(*pfn_base, dma->prot);
827 if (ret == -ENOMEM)
828 pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
829 "(%ld) exceeded\n", __func__,
830 dma->task->comm, task_pid_nr(dma->task),
831 task_rlimit(dma->task, RLIMIT_MEMLOCK));
832 }
833 }
834
835 out:
836 mmput(mm);
837 return ret;
838 }
839
840 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
841 bool do_accounting)
842 {
843 int unlocked;
844 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
845
846 if (!vpfn)
847 return 0;
848
849 unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
850
851 if (do_accounting)
852 vfio_lock_acct(dma, -unlocked, true);
853
854 return unlocked;
855 }
856
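/*
 * Pin user pages on behalf of an external (mdev) driver and return the
 * backing host pfns. Illustrative call shape only, mirroring the
 * parameters below with hypothetical values:
 *
 *	unsigned long user_pfn = iova >> PAGE_SHIFT, phys_pfn;
 *	ret = vfio_iommu_type1_pin_pages(iommu, grp, &user_pfn, 1,
 *					 IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 */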
857 static int vfio_iommu_type1_pin_pages(void *iommu_data,
858 struct iommu_group *iommu_group,
859 unsigned long *user_pfn,
860 int npage, int prot,
861 unsigned long *phys_pfn)
862 {
863 struct vfio_iommu *iommu = iommu_data;
864 struct vfio_iommu_group *group;
865 int i, j, ret;
866 unsigned long remote_vaddr;
867 struct vfio_dma *dma;
868 bool do_accounting;
869 dma_addr_t iova;
870
871 if (!iommu || !user_pfn || !phys_pfn)
872 return -EINVAL;
873
874 /* Supported for v2 version only */
875 if (!iommu->v2)
876 return -EACCES;
877
878 mutex_lock(&iommu->lock);
879
880 /*
881 * Wait for all necessary vaddr's to be valid so they can be used in
882 * the main loop without dropping the lock, to avoid racing vs unmap.
883 */
884 again:
885 if (iommu->vaddr_invalid_count) {
886 for (i = 0; i < npage; i++) {
887 iova = user_pfn[i] << PAGE_SHIFT;
888 ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
889 if (ret < 0)
890 goto pin_done;
891 if (ret == WAITED)
892 goto again;
893 }
894 }
895
896 /* Fail if notifier list is empty */
897 if (!iommu->notifier.head) {
898 ret = -EINVAL;
899 goto pin_done;
900 }
901
902 /*
903 * If an iommu-capable domain exists in the container then all pages are
904 * already pinned and accounted. Accounting is only needed when there is
905 * no iommu-capable domain in the container.
906 */
907 do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
908
909 for (i = 0; i < npage; i++) {
910 struct vfio_pfn *vpfn;
911
912 iova = user_pfn[i] << PAGE_SHIFT;
913 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
914 if (!dma) {
915 ret = -EINVAL;
916 goto pin_unwind;
917 }
918
919 if ((dma->prot & prot) != prot) {
920 ret = -EPERM;
921 goto pin_unwind;
922 }
923
924 vpfn = vfio_iova_get_vfio_pfn(dma, iova);
925 if (vpfn) {
926 phys_pfn[i] = vpfn->pfn;
927 continue;
928 }
929
930 remote_vaddr = dma->vaddr + (iova - dma->iova);
931 ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
932 do_accounting);
933 if (ret)
934 goto pin_unwind;
935
936 ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
937 if (ret) {
938 if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
939 vfio_lock_acct(dma, -1, true);
940 goto pin_unwind;
941 }
942
943 if (iommu->dirty_page_tracking) {
944 unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
945
946 /*
947 * Bitmap populated with the smallest supported page
948 * size
949 */
950 bitmap_set(dma->bitmap,
951 (iova - dma->iova) >> pgshift, 1);
952 }
953 }
954 ret = i;
955
956 group = vfio_iommu_find_iommu_group(iommu, iommu_group);
957 if (!group->pinned_page_dirty_scope) {
958 group->pinned_page_dirty_scope = true;
959 iommu->num_non_pinned_groups--;
960 }
961
962 goto pin_done;
963
964 pin_unwind:
965 phys_pfn[i] = 0;
966 for (j = 0; j < i; j++) {
967 dma_addr_t iova;
968
969 iova = user_pfn[j] << PAGE_SHIFT;
970 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
971 vfio_unpin_page_external(dma, iova, do_accounting);
972 phys_pfn[j] = 0;
973 }
974 pin_done:
975 mutex_unlock(&iommu->lock);
976 return ret;
977 }
978
979 static int vfio_iommu_type1_unpin_pages(void *iommu_data,
980 unsigned long *user_pfn,
981 int npage)
982 {
983 struct vfio_iommu *iommu = iommu_data;
984 bool do_accounting;
985 int i;
986
987 if (!iommu || !user_pfn || npage <= 0)
988 return -EINVAL;
989
990 /* Supported for v2 version only */
991 if (!iommu->v2)
992 return -EACCES;
993
994 mutex_lock(&iommu->lock);
995
996 do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
997 for (i = 0; i < npage; i++) {
998 struct vfio_dma *dma;
999 dma_addr_t iova;
1000
1001 iova = user_pfn[i] << PAGE_SHIFT;
1002 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
1003 if (!dma)
1004 break;
1005
1006 vfio_unpin_page_external(dma, iova, do_accounting);
1007 }
1008
1009 mutex_unlock(&iommu->lock);
1010 return i > 0 ? i : -EINVAL;
1011 }
1012
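/*
 * Flush the gathered IOTLB invalidations, then unpin and free every
 * deferred region on the list, returning how many pages were unlocked.
 */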
1013 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
1014 struct list_head *regions,
1015 struct iommu_iotlb_gather *iotlb_gather)
1016 {
1017 long unlocked = 0;
1018 struct vfio_regions *entry, *next;
1019
1020 iommu_iotlb_sync(domain->domain, iotlb_gather);
1021
1022 list_for_each_entry_safe(entry, next, regions, list) {
1023 unlocked += vfio_unpin_pages_remote(dma,
1024 entry->iova,
1025 entry->phys >> PAGE_SHIFT,
1026 entry->len >> PAGE_SHIFT,
1027 false);
1028 list_del(&entry->list);
1029 kfree(entry);
1030 }
1031
1032 cond_resched();
1033
1034 return unlocked;
1035 }
1036
1037 /*
1038 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
1039 * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep track
1040 * of these regions (currently using a list).
1041 *
1042 * This value specifies maximum number of regions for each IOTLB flush sync.
1043 */
1044 #define VFIO_IOMMU_TLB_SYNC_MAX 512
1045
1046 static size_t unmap_unpin_fast(struct vfio_domain *domain,
1047 struct vfio_dma *dma, dma_addr_t *iova,
1048 size_t len, phys_addr_t phys, long *unlocked,
1049 struct list_head *unmapped_list,
1050 int *unmapped_cnt,
1051 struct iommu_iotlb_gather *iotlb_gather)
1052 {
1053 size_t unmapped = 0;
1054 struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
1055
1056 if (entry) {
1057 unmapped = iommu_unmap_fast(domain->domain, *iova, len,
1058 iotlb_gather);
1059
1060 if (!unmapped) {
1061 kfree(entry);
1062 } else {
1063 entry->iova = *iova;
1064 entry->phys = phys;
1065 entry->len = unmapped;
1066 list_add_tail(&entry->list, unmapped_list);
1067
1068 *iova += unmapped;
1069 (*unmapped_cnt)++;
1070 }
1071 }
1072
1073 /*
1074 * Sync if the number of fast-unmap regions hits the limit
1075 * or in case of errors.
1076 */
1077 if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
1078 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
1079 iotlb_gather);
1080 *unmapped_cnt = 0;
1081 }
1082
1083 return unmapped;
1084 }
1085
1086 static size_t unmap_unpin_slow(struct vfio_domain *domain,
1087 struct vfio_dma *dma, dma_addr_t *iova,
1088 size_t len, phys_addr_t phys,
1089 long *unlocked)
1090 {
1091 size_t unmapped = iommu_unmap(domain->domain, *iova, len);
1092
1093 if (unmapped) {
1094 *unlocked += vfio_unpin_pages_remote(dma, *iova,
1095 phys >> PAGE_SHIFT,
1096 unmapped >> PAGE_SHIFT,
1097 false);
1098 *iova += unmapped;
1099 cond_resched();
1100 }
1101 return unmapped;
1102 }
1103
1104 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
1105 bool do_accounting)
1106 {
1107 dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
1108 struct vfio_domain *domain, *d;
1109 LIST_HEAD(unmapped_region_list);
1110 struct iommu_iotlb_gather iotlb_gather;
1111 int unmapped_region_cnt = 0;
1112 long unlocked = 0;
1113
1114 if (!dma->size)
1115 return 0;
1116
1117 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1118 return 0;
1119
1120 /*
1121 * We use the IOMMU to track the physical addresses, otherwise we'd
1122 * need a much more complicated tracking system. Unfortunately that
1123 * means we need to use one of the iommu domains to figure out the
1124 * pfns to unpin. The rest need to be unmapped in advance so we have
1125 * no iommu translations remaining when the pages are unpinned.
1126 */
1127 domain = d = list_first_entry(&iommu->domain_list,
1128 struct vfio_domain, next);
1129
1130 list_for_each_entry_continue(d, &iommu->domain_list, next) {
1131 iommu_unmap(d->domain, dma->iova, dma->size);
1132 cond_resched();
1133 }
1134
1135 iommu_iotlb_gather_init(&iotlb_gather);
1136 while (iova < end) {
1137 size_t unmapped, len;
1138 phys_addr_t phys, next;
1139
1140 phys = iommu_iova_to_phys(domain->domain, iova);
1141 if (WARN_ON(!phys)) {
1142 iova += PAGE_SIZE;
1143 continue;
1144 }
1145
1146 /*
1147 * To optimize for fewer iommu_unmap() calls, each of which
1148 * may require hardware cache flushing, try to find the
1149 * largest contiguous physical memory chunk to unmap.
1150 */
1151 for (len = PAGE_SIZE;
1152 !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
1153 next = iommu_iova_to_phys(domain->domain, iova + len);
1154 if (next != phys + len)
1155 break;
1156 }
1157
1158 /*
1159 * First, try to use fast unmap/unpin. In case of failure,
1160 * switch to slow unmap/unpin path.
1161 */
1162 unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
1163 &unlocked, &unmapped_region_list,
1164 &unmapped_region_cnt,
1165 &iotlb_gather);
1166 if (!unmapped) {
1167 unmapped = unmap_unpin_slow(domain, dma, &iova, len,
1168 phys, &unlocked);
1169 if (WARN_ON(!unmapped))
1170 break;
1171 }
1172 }
1173
1174 dma->iommu_mapped = false;
1175
1176 if (unmapped_region_cnt) {
1177 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
1178 &iotlb_gather);
1179 }
1180
1181 if (do_accounting) {
1182 vfio_lock_acct(dma, -unlocked, true);
1183 return 0;
1184 }
1185 return unlocked;
1186 }
1187
1188 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
1189 {
1190 WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
1191 vfio_unmap_unpin(iommu, dma, true);
1192 vfio_unlink_dma(iommu, dma);
1193 put_task_struct(dma->task);
1194 mmdrop(dma->mm);
1195 vfio_dma_bitmap_free(dma);
1196 if (dma->vaddr_invalid) {
1197 iommu->vaddr_invalid_count--;
1198 wake_up_all(&iommu->vaddr_wait);
1199 }
1200 kfree(dma);
1201 iommu->dma_avail++;
1202 }
1203
1204 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1205 {
1206 struct vfio_domain *domain;
1207
1208 iommu->pgsize_bitmap = ULONG_MAX;
1209
1210 list_for_each_entry(domain, &iommu->domain_list, next)
1211 iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1212
1213 /*
1214 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
1215 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
1216 * That way the user will be able to map/unmap buffers whose size/
1217 * start address is aligned with PAGE_SIZE. The pinning code uses that
1218 * granularity while the iommu driver can use the sub-PAGE_SIZE size
1219 * to map the buffer.
1220 */
1221 if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1222 iommu->pgsize_bitmap &= PAGE_MASK;
1223 iommu->pgsize_bitmap |= PAGE_SIZE;
1224 }
1225 }
1226
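/*
 * Copy dma->bitmap into the user-supplied bitmap at the bit offset that
 * corresponds to (dma->iova - base_iova). When that offset is not
 * u64-aligned, shift our bitmap left and OR the user's partially filled
 * word back in so adjacent ranges club together correctly. For example,
 * with base_iova = 0, dma->iova = 0x5000 and pgsize = 4K: bit_offset = 5,
 * copy_offset = 0, shift = 5.
 */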
1227 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1228 struct vfio_dma *dma, dma_addr_t base_iova,
1229 size_t pgsize)
1230 {
1231 unsigned long pgshift = __ffs(pgsize);
1232 unsigned long nbits = dma->size >> pgshift;
1233 unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
1234 unsigned long copy_offset = bit_offset / BITS_PER_LONG;
1235 unsigned long shift = bit_offset % BITS_PER_LONG;
1236 unsigned long leftover;
1237
1238 /*
1239 * mark all pages dirty if any IOMMU capable device is not able
1240 * to report dirty pages and all pages are pinned and mapped.
1241 */
1242 if (iommu->num_non_pinned_groups && dma->iommu_mapped)
1243 bitmap_set(dma->bitmap, 0, nbits);
1244
1245 if (shift) {
1246 bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
1247 nbits + shift);
1248
1249 if (copy_from_user(&leftover,
1250 (void __user *)(bitmap + copy_offset),
1251 sizeof(leftover)))
1252 return -EFAULT;
1253
1254 bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1255 }
1256
1257 if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1258 DIRTY_BITMAP_BYTES(nbits + shift)))
1259 return -EFAULT;
1260
1261 return 0;
1262 }
1263
1264 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1265 dma_addr_t iova, size_t size, size_t pgsize)
1266 {
1267 struct vfio_dma *dma;
1268 struct rb_node *n;
1269 unsigned long pgshift = __ffs(pgsize);
1270 int ret;
1271
1272 /*
1273 * GET_BITMAP request must fully cover vfio_dma mappings. Multiple
1274 * vfio_dma mappings may be clubbed by specifying large ranges, but
1275 * there must not be any previous mappings bisected by the range.
1276 * An error will be returned if these conditions are not met.
1277 */
1278 dma = vfio_find_dma(iommu, iova, 1);
1279 if (dma && dma->iova != iova)
1280 return -EINVAL;
1281
1282 dma = vfio_find_dma(iommu, iova + size - 1, 0);
1283 if (dma && dma->iova + dma->size != iova + size)
1284 return -EINVAL;
1285
1286 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1287 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1288
1289 if (dma->iova < iova)
1290 continue;
1291
1292 if (dma->iova > iova + size - 1)
1293 break;
1294
1295 ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1296 if (ret)
1297 return ret;
1298
1299 /*
1300 * Re-populate the bitmap to include all pinned pages, which are
1301 * always considered dirty, but exclude pages that have since been
1302 * unpinned or were only marked dirty by vfio_dma_rw().
1303 */
1304 bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1305 vfio_dma_populate_bitmap(dma, pgsize);
1306 }
1307 return 0;
1308 }
1309
1310 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1311 {
1312 if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1313 (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1314 return -EINVAL;
1315
1316 return 0;
1317 }
1318
1319 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1320 struct vfio_iommu_type1_dma_unmap *unmap,
1321 struct vfio_bitmap *bitmap)
1322 {
1323 struct vfio_dma *dma, *dma_last = NULL;
1324 size_t unmapped = 0, pgsize;
1325 int ret = -EINVAL, retries = 0;
1326 unsigned long pgshift;
1327 dma_addr_t iova = unmap->iova;
1328 u64 size = unmap->size;
1329 bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
1330 bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
1331 struct rb_node *n, *first_n;
1332
1333 mutex_lock(&iommu->lock);
1334
1335 pgshift = __ffs(iommu->pgsize_bitmap);
1336 pgsize = (size_t)1 << pgshift;
1337
1338 if (iova & (pgsize - 1))
1339 goto unlock;
1340
1341 if (unmap_all) {
1342 if (iova || size)
1343 goto unlock;
1344 size = U64_MAX;
1345 } else if (!size || size & (pgsize - 1) ||
1346 iova + size - 1 < iova || size > SIZE_MAX) {
1347 goto unlock;
1348 }
1349
1350 /* When dirty tracking is enabled, allow only min supported pgsize */
1351 if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1352 (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1353 goto unlock;
1354 }
1355
1356 WARN_ON((pgsize - 1) & PAGE_MASK);
1357 again:
1358 /*
1359 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
1360 * avoid tracking individual mappings. This means that the granularity
1361 * of the original mapping was lost and the user was allowed to attempt
1362 * to unmap any range. Depending on the contiguousness of physical
1363 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
1364 * or may not have worked. We only guaranteed unmap granularity
1365 * matching the original mapping; even though it was untracked here,
1366 * the original mappings are reflected in IOMMU mappings. This
1367 * resulted in a couple unusual behaviors. First, if a range is not
1368 * able to be unmapped, ex. a set of 4k pages that was mapped as a
1369 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1370 * a zero sized unmap. Also, if an unmap request overlaps the first
1371 * address of a hugepage, the IOMMU will unmap the entire hugepage.
1372 * This also returns success and the returned unmap size reflects the
1373 * actual size unmapped.
1374 *
1375 * We attempt to maintain compatibility with this "v1" interface, but
1376 * we take control out of the hands of the IOMMU. Therefore, an unmap
1377 * request offset from the beginning of the original mapping will
1378 * return success with zero sized unmap. And an unmap request covering
1379 * the first iova of mapping will unmap the entire range.
1380 *
1381 * The v2 version of this interface intends to be more deterministic.
1382 * Unmap requests must fully cover previous mappings. Multiple
1383 * mappings may still be unmapped by specifying large ranges, but there
1384 * must not be any previous mappings bisected by the range. An error
1385 * will be returned if these conditions are not met. The v2 interface
1386 * will only return success and a size of zero if there were no
1387 * mappings within the range.
1388 */
1389 if (iommu->v2 && !unmap_all) {
1390 dma = vfio_find_dma(iommu, iova, 1);
1391 if (dma && dma->iova != iova)
1392 goto unlock;
1393
1394 dma = vfio_find_dma(iommu, iova + size - 1, 0);
1395 if (dma && dma->iova + dma->size != iova + size)
1396 goto unlock;
1397 }
1398
1399 ret = 0;
1400 n = first_n = vfio_find_dma_first_node(iommu, iova, size);
1401
1402 while (n) {
1403 dma = rb_entry(n, struct vfio_dma, node);
1404 if (dma->iova >= iova + size)
1405 break;
1406
1407 if (!iommu->v2 && iova > dma->iova)
1408 break;
1409 /*
1410 * Only a task with the same address space that mapped this iova
1411 * range is allowed to unmap it.
1412 */
1413 if (dma->task->mm != current->mm)
1414 break;
1415
1416 if (invalidate_vaddr) {
1417 if (dma->vaddr_invalid) {
1418 struct rb_node *last_n = n;
1419
1420 for (n = first_n; n != last_n; n = rb_next(n)) {
1421 dma = rb_entry(n,
1422 struct vfio_dma, node);
1423 dma->vaddr_invalid = false;
1424 iommu->vaddr_invalid_count--;
1425 }
1426 ret = -EINVAL;
1427 unmapped = 0;
1428 break;
1429 }
1430 dma->vaddr_invalid = true;
1431 iommu->vaddr_invalid_count++;
1432 unmapped += dma->size;
1433 n = rb_next(n);
1434 continue;
1435 }
1436
1437 if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1438 struct vfio_iommu_type1_dma_unmap nb_unmap;
1439
1440 if (dma_last == dma) {
1441 BUG_ON(++retries > 10);
1442 } else {
1443 dma_last = dma;
1444 retries = 0;
1445 }
1446
1447 nb_unmap.iova = dma->iova;
1448 nb_unmap.size = dma->size;
1449
1450 /*
1451 * Notify anyone (mdev vendor drivers) to invalidate and
1452 * unmap iovas within the range we're about to unmap.
1453 * Vendor drivers MUST unpin pages in response to an
1454 * invalidation.
1455 */
1456 mutex_unlock(&iommu->lock);
1457 blocking_notifier_call_chain(&iommu->notifier,
1458 VFIO_IOMMU_NOTIFY_DMA_UNMAP,
1459 &nb_unmap);
1460 mutex_lock(&iommu->lock);
1461 goto again;
1462 }
1463
1464 if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1465 ret = update_user_bitmap(bitmap->data, iommu, dma,
1466 iova, pgsize);
1467 if (ret)
1468 break;
1469 }
1470
1471 unmapped += dma->size;
1472 n = rb_next(n);
1473 vfio_remove_dma(iommu, dma);
1474 }
1475
1476 unlock:
1477 mutex_unlock(&iommu->lock);
1478
1479 /* Report how much was unmapped */
1480 unmap->size = unmapped;
1481
1482 return ret;
1483 }
1484
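/*
 * Map the pinned pfn range into every IOMMU domain in the container,
 * unwinding the mappings from earlier domains if any one of them fails.
 */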
1485 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1486 unsigned long pfn, long npage, int prot)
1487 {
1488 struct vfio_domain *d;
1489 int ret;
1490
1491 list_for_each_entry(d, &iommu->domain_list, next) {
1492 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1493 npage << PAGE_SHIFT, prot | d->prot);
1494 if (ret)
1495 goto unwind;
1496
1497 cond_resched();
1498 }
1499
1500 return 0;
1501
1502 unwind:
1503 list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1504 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1505 cond_resched();
1506 }
1507
1508 return ret;
1509 }
1510
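/*
 * Pin the user memory backing a new mapping in contiguous chunks and map
 * each chunk into all domains; dma->size grows as chunks succeed, and the
 * whole vfio_dma is removed if any step fails.
 */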
1511 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1512 size_t map_size)
1513 {
1514 dma_addr_t iova = dma->iova;
1515 unsigned long vaddr = dma->vaddr;
1516 struct vfio_batch batch;
1517 size_t size = map_size;
1518 long npage;
1519 unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1520 int ret = 0;
1521
1522 vfio_batch_init(&batch);
1523
1524 while (size) {
1525 /* Pin a contiguous chunk of memory */
1526 npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1527 size >> PAGE_SHIFT, &pfn, limit,
1528 &batch);
1529 if (npage <= 0) {
1530 WARN_ON(!npage);
1531 ret = (int)npage;
1532 break;
1533 }
1534
1535 /* Map it! */
1536 ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1537 dma->prot);
1538 if (ret) {
1539 vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1540 npage, true);
1541 vfio_batch_unpin(&batch, dma);
1542 break;
1543 }
1544
1545 size -= npage << PAGE_SHIFT;
1546 dma->size += npage << PAGE_SHIFT;
1547 }
1548
1549 vfio_batch_fini(&batch);
1550 dma->iommu_mapped = true;
1551
1552 if (ret)
1553 vfio_remove_dma(iommu, dma);
1554
1555 return ret;
1556 }
1557
1558 /*
1559 * Check that a dma map request is within a valid iova range
1560 */
1561 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1562 dma_addr_t start, dma_addr_t end)
1563 {
1564 struct list_head *iova = &iommu->iova_list;
1565 struct vfio_iova *node;
1566
1567 list_for_each_entry(node, iova, list) {
1568 if (start >= node->start && end <= node->end)
1569 return true;
1570 }
1571
1572 /*
1573 * Check for list_empty() as well since a container with
1574 * a single mdev device will have an empty list.
1575 */
1576 return list_empty(iova);
1577 }
1578
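/*
 * When VFIO_DMA_MAP_FLAG_VADDR re-registers a mapping from a different
 * process, transfer the locked_vm accounting and the task/mm ownership of
 * the vfio_dma from the old mm to the current one.
 */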
1579 static int vfio_change_dma_owner(struct vfio_dma *dma)
1580 {
1581 struct task_struct *task = current->group_leader;
1582 struct mm_struct *mm = current->mm;
1583 long npage = dma->locked_vm;
1584 bool lock_cap;
1585 int ret;
1586
1587 if (mm == dma->mm)
1588 return 0;
1589
1590 lock_cap = capable(CAP_IPC_LOCK);
1591 ret = mm_lock_acct(task, mm, lock_cap, npage);
1592 if (ret)
1593 return ret;
1594
1595 if (mmget_not_zero(dma->mm)) {
1596 mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
1597 mmput(dma->mm);
1598 }
1599
1600 if (dma->task != task) {
1601 put_task_struct(dma->task);
1602 dma->task = get_task_struct(task);
1603 }
1604 mmdrop(dma->mm);
1605 dma->mm = mm;
1606 mmgrab(dma->mm);
1607 dma->lock_cap = lock_cap;
1608 return 0;
1609 }
1610
1611 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1612 struct vfio_iommu_type1_dma_map *map)
1613 {
1614 bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
1615 dma_addr_t iova = map->iova;
1616 unsigned long vaddr = map->vaddr;
1617 size_t size = map->size;
1618 int ret = 0, prot = 0;
1619 size_t pgsize;
1620 struct vfio_dma *dma;
1621
1622 /* Verify that none of our __u64 fields overflow */
1623 if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1624 return -EINVAL;
1625
1626 /* READ/WRITE from device perspective */
1627 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1628 prot |= IOMMU_WRITE;
1629 if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1630 prot |= IOMMU_READ;
1631
1632 if ((prot && set_vaddr) || (!prot && !set_vaddr))
1633 return -EINVAL;
1634
1635 mutex_lock(&iommu->lock);
1636
1637 pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1638
1639 WARN_ON((pgsize - 1) & PAGE_MASK);
1640
1641 if (!size || (size | iova | vaddr) & (pgsize - 1)) {
1642 ret = -EINVAL;
1643 goto out_unlock;
1644 }
1645
1646 /* Don't allow IOVA or virtual address wrap */
1647 if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1648 ret = -EINVAL;
1649 goto out_unlock;
1650 }
1651
1652 dma = vfio_find_dma(iommu, iova, size);
1653 if (set_vaddr) {
1654 if (!dma) {
1655 ret = -ENOENT;
1656 } else if (!dma->vaddr_invalid || dma->iova != iova ||
1657 dma->size != size) {
1658 ret = -EINVAL;
1659 } else {
1660 ret = vfio_change_dma_owner(dma);
1661 if (ret)
1662 goto out_unlock;
1663 dma->vaddr = vaddr;
1664 dma->vaddr_invalid = false;
1665 iommu->vaddr_invalid_count--;
1666 wake_up_all(&iommu->vaddr_wait);
1667 }
1668 goto out_unlock;
1669 } else if (dma) {
1670 ret = -EEXIST;
1671 goto out_unlock;
1672 }
1673
1674 if (!iommu->dma_avail) {
1675 ret = -ENOSPC;
1676 goto out_unlock;
1677 }
1678
1679 if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1680 ret = -EINVAL;
1681 goto out_unlock;
1682 }
1683
1684 dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1685 if (!dma) {
1686 ret = -ENOMEM;
1687 goto out_unlock;
1688 }
1689
1690 iommu->dma_avail--;
1691 dma->iova = iova;
1692 dma->vaddr = vaddr;
1693 dma->prot = prot;
1694
1695 /*
1696 * We need to be able to both add to a task's locked memory and test
1697 * against the locked memory limit and we need to be able to do both
1698 * outside of this call path as pinning can be asynchronous via the
1699 * external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
1700 * task_struct. Save the group_leader so that all DMA tracking uses
1701 * the same task, to make debugging easier. VM locked pages requires
1702 * an mm_struct, so grab the mm in case the task dies.
1703 */
1704 get_task_struct(current->group_leader);
1705 dma->task = current->group_leader;
1706 dma->lock_cap = capable(CAP_IPC_LOCK);
1707 dma->mm = current->mm;
1708 mmgrab(dma->mm);
1709
1710 dma->pfn_list = RB_ROOT;
1711
1712 /* Insert zero-sized and grow as we map chunks of it */
1713 vfio_link_dma(iommu, dma);
1714
1715 /* Don't pin and map if container doesn't contain IOMMU capable domain */
1716 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1717 dma->size = size;
1718 else
1719 ret = vfio_pin_map_dma(iommu, dma, size);
1720
1721 if (!ret && iommu->dirty_page_tracking) {
1722 ret = vfio_dma_bitmap_alloc(dma, pgsize);
1723 if (ret)
1724 vfio_remove_dma(iommu, dma);
1725 }
1726
1727 out_unlock:
1728 mutex_unlock(&iommu->lock);
1729 return ret;
1730 }
1731
1732 static int vfio_bus_type(struct device *dev, void *data)
1733 {
1734 struct bus_type **bus = data;
1735
1736 if (*bus && *bus != dev->bus)
1737 return -EINVAL;
1738
1739 *bus = dev->bus;
1740
1741 return 0;
1742 }
1743
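/*
 * Replay all existing mappings into a newly attached domain: ranges that
 * are already iommu-mapped are resolved through an existing domain, while
 * the remaining ranges are pinned here before being mapped.
 */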
1744 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1745 struct vfio_domain *domain)
1746 {
1747 struct vfio_batch batch;
1748 struct vfio_domain *d = NULL;
1749 struct rb_node *n;
1750 unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1751 int ret;
1752
1753 ret = vfio_wait_all_valid(iommu);
1754 if (ret < 0)
1755 return ret;
1756
1757 /* Arbitrarily pick the first domain in the list for lookups */
1758 if (!list_empty(&iommu->domain_list))
1759 d = list_first_entry(&iommu->domain_list,
1760 struct vfio_domain, next);
1761
1762 vfio_batch_init(&batch);
1763
1764 n = rb_first(&iommu->dma_list);
1765
1766 for (; n; n = rb_next(n)) {
1767 struct vfio_dma *dma;
1768 dma_addr_t iova;
1769
1770 dma = rb_entry(n, struct vfio_dma, node);
1771 iova = dma->iova;
1772
1773 while (iova < dma->iova + dma->size) {
1774 phys_addr_t phys;
1775 size_t size;
1776
1777 if (dma->iommu_mapped) {
1778 phys_addr_t p;
1779 dma_addr_t i;
1780
1781 if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1782 ret = -EINVAL;
1783 goto unwind;
1784 }
1785
1786 phys = iommu_iova_to_phys(d->domain, iova);
1787
1788 if (WARN_ON(!phys)) {
1789 iova += PAGE_SIZE;
1790 continue;
1791 }
1792
1793 size = PAGE_SIZE;
1794 p = phys + size;
1795 i = iova + size;
1796 while (i < dma->iova + dma->size &&
1797 p == iommu_iova_to_phys(d->domain, i)) {
1798 size += PAGE_SIZE;
1799 p += PAGE_SIZE;
1800 i += PAGE_SIZE;
1801 }
1802 } else {
1803 unsigned long pfn;
1804 unsigned long vaddr = dma->vaddr +
1805 (iova - dma->iova);
1806 size_t n = dma->iova + dma->size - iova;
1807 long npage;
1808
1809 npage = vfio_pin_pages_remote(dma, vaddr,
1810 n >> PAGE_SHIFT,
1811 &pfn, limit,
1812 &batch);
1813 if (npage <= 0) {
1814 WARN_ON(!npage);
1815 ret = (int)npage;
1816 goto unwind;
1817 }
1818
1819 phys = pfn << PAGE_SHIFT;
1820 size = npage << PAGE_SHIFT;
1821 }
1822
1823 ret = iommu_map(domain->domain, iova, phys,
1824 size, dma->prot | domain->prot);
1825 if (ret) {
1826 if (!dma->iommu_mapped) {
1827 vfio_unpin_pages_remote(dma, iova,
1828 phys >> PAGE_SHIFT,
1829 size >> PAGE_SHIFT,
1830 true);
1831 vfio_batch_unpin(&batch, dma);
1832 }
1833 goto unwind;
1834 }
1835
1836 iova += size;
1837 }
1838 }
1839
1840 /* All dmas are now mapped, defer to second tree walk for unwind */
1841 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1842 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1843
1844 dma->iommu_mapped = true;
1845 }
1846
1847 vfio_batch_fini(&batch);
1848 return 0;
1849
1850 unwind:
1851 for (; n; n = rb_prev(n)) {
1852 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1853 dma_addr_t iova;
1854
1855 if (dma->iommu_mapped) {
1856 iommu_unmap(domain->domain, dma->iova, dma->size);
1857 continue;
1858 }
1859
1860 iova = dma->iova;
1861 while (iova < dma->iova + dma->size) {
1862 phys_addr_t phys, p;
1863 size_t size;
1864 dma_addr_t i;
1865
1866 phys = iommu_iova_to_phys(domain->domain, iova);
1867 if (!phys) {
1868 iova += PAGE_SIZE;
1869 continue;
1870 }
1871
1872 size = PAGE_SIZE;
1873 p = phys + size;
1874 i = iova + size;
1875 while (i < dma->iova + dma->size &&
1876 p == iommu_iova_to_phys(domain->domain, i)) {
1877 size += PAGE_SIZE;
1878 p += PAGE_SIZE;
1879 i += PAGE_SIZE;
1880 }
1881
1882 iommu_unmap(domain->domain, iova, size);
1883 vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1884 size >> PAGE_SHIFT, true);
1885 }
1886 }
1887
1888 vfio_batch_fini(&batch);
1889 return ret;
1890 }
1891
1892 /*
1893 * We change our unmap behavior slightly depending on whether the IOMMU
1894 * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage
1895 * for practically any contiguous power-of-two mapping we give it. This means
1896 * we don't need to look for contiguous chunks ourselves to make unmapping
1897 * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d
1898 * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1899 * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1900 * hugetlbfs is in use.
1901 */
1902 static void vfio_test_domain_fgsp(struct vfio_domain *domain)
1903 {
1904 struct page *pages;
1905 int ret, order = get_order(PAGE_SIZE * 2);
1906
1907 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1908 if (!pages)
1909 return;
1910
1911 ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
1912 IOMMU_READ | IOMMU_WRITE | domain->prot);
1913 if (!ret) {
1914 size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
1915
1916 if (unmapped == PAGE_SIZE)
1917 iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
1918 else
1919 domain->fgsp = true;
1920 }
1921
1922 __free_pages(pages, order);
1923 }
1924
1925 static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
1926 struct iommu_group *iommu_group)
1927 {
1928 struct vfio_iommu_group *g;
1929
1930 list_for_each_entry(g, &domain->group_list, next) {
1931 if (g->iommu_group == iommu_group)
1932 return g;
1933 }
1934
1935 return NULL;
1936 }
1937
1938 static struct vfio_iommu_group*
1939 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1940 struct iommu_group *iommu_group)
1941 {
1942 struct vfio_domain *domain;
1943 struct vfio_iommu_group *group = NULL;
1944
1945 list_for_each_entry(domain, &iommu->domain_list, next) {
1946 group = find_iommu_group(domain, iommu_group);
1947 if (group)
1948 return group;
1949 }
1950
1951 if (iommu->external_domain)
1952 group = find_iommu_group(iommu->external_domain, iommu_group);
1953
1954 return group;
1955 }
1956
1957 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1958 phys_addr_t *base)
1959 {
1960 struct iommu_resv_region *region;
1961 bool ret = false;
1962
1963 list_for_each_entry(region, group_resv_regions, list) {
1964 /*
1965 * The presence of any 'real' MSI regions should take
1966 * precedence over the software-managed one if the
1967 * IOMMU driver happens to advertise both types.
1968 */
1969 if (region->type == IOMMU_RESV_MSI) {
1970 ret = false;
1971 break;
1972 }
1973
1974 if (region->type == IOMMU_RESV_SW_MSI) {
1975 *base = region->start;
1976 ret = true;
1977 }
1978 }
1979
1980 return ret;
1981 }
1982
1983 static int vfio_mdev_attach_domain(struct device *dev, void *data)
1984 {
1985 struct mdev_device *mdev = to_mdev_device(dev);
1986 struct iommu_domain *domain = data;
1987 struct device *iommu_device;
1988
1989 iommu_device = mdev_get_iommu_device(mdev);
1990 if (iommu_device) {
1991 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1992 return iommu_aux_attach_device(domain, iommu_device);
1993 else
1994 return iommu_attach_device(domain, iommu_device);
1995 }
1996
1997 return -EINVAL;
1998 }
1999
2000 static int vfio_mdev_detach_domain(struct device *dev, void *data)
2001 {
2002 struct mdev_device *mdev = to_mdev_device(dev);
2003 struct iommu_domain *domain = data;
2004 struct device *iommu_device;
2005
2006 iommu_device = mdev_get_iommu_device(mdev);
2007 if (iommu_device) {
2008 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
2009 iommu_aux_detach_device(domain, iommu_device);
2010 else
2011 iommu_detach_device(domain, iommu_device);
2012 }
2013
2014 return 0;
2015 }
2016
2017 static int vfio_iommu_attach_group(struct vfio_domain *domain,
2018 struct vfio_iommu_group *group)
2019 {
2020 if (group->mdev_group)
2021 return iommu_group_for_each_dev(group->iommu_group,
2022 domain->domain,
2023 vfio_mdev_attach_domain);
2024 else
2025 return iommu_attach_group(domain->domain, group->iommu_group);
2026 }
2027
2028 static void vfio_iommu_detach_group(struct vfio_domain *domain,
2029 struct vfio_iommu_group *group)
2030 {
2031 if (group->mdev_group)
2032 iommu_group_for_each_dev(group->iommu_group, domain->domain,
2033 vfio_mdev_detach_domain);
2034 else
2035 iommu_detach_group(domain->domain, group->iommu_group);
2036 }
2037
2038 static bool vfio_bus_is_mdev(struct bus_type *bus)
2039 {
2040 struct bus_type *mdev_bus;
2041 bool ret = false;
2042
2043 mdev_bus = symbol_get(mdev_bus_type);
2044 if (mdev_bus) {
2045 ret = (bus == mdev_bus);
2046 symbol_put(mdev_bus_type);
2047 }
2048
2049 return ret;
2050 }
2051
2052 static int vfio_mdev_iommu_device(struct device *dev, void *data)
2053 {
2054 struct mdev_device *mdev = to_mdev_device(dev);
2055 struct device **old = data, *new;
2056
2057 new = mdev_get_iommu_device(mdev);
2058 if (!new || (*old && *old != new))
2059 return -EINVAL;
2060
2061 *old = new;
2062
2063 return 0;
2064 }
2065
2066 /*
2067 * This is a helper function to insert an address range into the iova list.
2068 * The list is initially created with a single entry corresponding to
2069 * the IOMMU domain geometry to which the device group is attached.
2070 * The list aperture gets modified when a new domain is added to the
2071 * container if the new aperture doesn't conflict with the current one
2072 * or with any existing dma mappings. The list is also modified to
2073 * exclude any reserved regions associated with the device group.
2074 */
2075 static int vfio_iommu_iova_insert(struct list_head *head,
2076 dma_addr_t start, dma_addr_t end)
2077 {
2078 struct vfio_iova *region;
2079
2080 region = kmalloc(sizeof(*region), GFP_KERNEL);
2081 if (!region)
2082 return -ENOMEM;
2083
2084 INIT_LIST_HEAD(&region->list);
2085 region->start = start;
2086 region->end = end;
2087
2088 list_add_tail(&region->list, head);
2089 return 0;
2090 }
2091
2092 /*
2093 * Check whether the new IOMMU aperture conflicts with the existing
2094 * aperture or with any existing dma mappings.
2095 */
2096 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
2097 dma_addr_t start, dma_addr_t end)
2098 {
2099 struct vfio_iova *first, *last;
2100 struct list_head *iova = &iommu->iova_list;
2101
2102 if (list_empty(iova))
2103 return false;
2104
2105 /* Disjoint sets, return conflict */
2106 first = list_first_entry(iova, struct vfio_iova, list);
2107 last = list_last_entry(iova, struct vfio_iova, list);
2108 if (start > last->end || end < first->start)
2109 return true;
2110
2111 /* Check for any existing dma mappings below the new start */
2112 if (start > first->start) {
2113 if (vfio_find_dma(iommu, first->start, start - first->start))
2114 return true;
2115 }
2116
2117 /* Check for any existing dma mappings beyond the new end */
2118 if (end < last->end) {
2119 if (vfio_find_dma(iommu, end + 1, last->end - end))
2120 return true;
2121 }
2122
2123 return false;
2124 }
2125
2126 /*
2127 * Resize iommu iova aperture window. This is called only if the new
2128 * aperture has no conflict with existing aperture and dma mappings.
2129 */
2130 static int vfio_iommu_aper_resize(struct list_head *iova,
2131 dma_addr_t start, dma_addr_t end)
2132 {
2133 struct vfio_iova *node, *next;
2134
2135 if (list_empty(iova))
2136 return vfio_iommu_iova_insert(iova, start, end);
2137
2138 /* Adjust iova list start */
2139 list_for_each_entry_safe(node, next, iova, list) {
2140 if (start < node->start)
2141 break;
2142 if (start >= node->start && start < node->end) {
2143 node->start = start;
2144 break;
2145 }
2146 /* Delete nodes before new start */
2147 list_del(&node->list);
2148 kfree(node);
2149 }
2150
2151 /* Adjust iova list end */
2152 list_for_each_entry_safe(node, next, iova, list) {
2153 if (end > node->end)
2154 continue;
2155 if (end > node->start && end <= node->end) {
2156 node->end = end;
2157 continue;
2158 }
2159 /* Delete nodes after new end */
2160 list_del(&node->list);
2161 kfree(node);
2162 }
2163
2164 return 0;
2165 }
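/*
 * For example (illustrative values), resizing a list holding the single
 * node [0x0, 0xffffffff] to the aperture [0x1000, 0x7fffffff] trims that
 * node to [0x1000, 0x7fffffff]; nodes falling entirely outside the new
 * aperture are simply deleted.
 */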
2166
2167 /*
2168 * Check reserved region conflicts with existing dma mappings
2169 */
2170 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
2171 struct list_head *resv_regions)
2172 {
2173 struct iommu_resv_region *region;
2174
2175 /* Check for conflict with existing dma mappings */
2176 list_for_each_entry(region, resv_regions, list) {
2177 if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
2178 continue;
2179
2180 if (vfio_find_dma(iommu, region->start, region->length))
2181 return true;
2182 }
2183
2184 return false;
2185 }
2186
2187 /*
2188 * Check iova region overlap with reserved regions and
2189 * exclude them from the iommu iova range
2190 */
2191 static int vfio_iommu_resv_exclude(struct list_head *iova,
2192 struct list_head *resv_regions)
2193 {
2194 struct iommu_resv_region *resv;
2195 struct vfio_iova *n, *next;
2196
2197 list_for_each_entry(resv, resv_regions, list) {
2198 phys_addr_t start, end;
2199
2200 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2201 continue;
2202
2203 start = resv->start;
2204 end = resv->start + resv->length - 1;
2205
2206 list_for_each_entry_safe(n, next, iova, list) {
2207 int ret = 0;
2208
2209 /* No overlap */
2210 if (start > n->end || end < n->start)
2211 continue;
2212 /*
2213 * If the current node overlaps the reserved region, insert new
2214 * node(s) covering the part(s) of the current node that lie outside
2215 * the reserved region. The new nodes are inserted before the current
2216 * node, and the current node is then deleted, keeping the list
2217 * updated and sorted.
2218 */
2219 if (start > n->start)
2220 ret = vfio_iommu_iova_insert(&n->list, n->start,
2221 start - 1);
2222 if (!ret && end < n->end)
2223 ret = vfio_iommu_iova_insert(&n->list, end + 1,
2224 n->end);
2225 if (ret)
2226 return ret;
2227
2228 list_del(&n->list);
2229 kfree(n);
2230 }
2231 }
2232
2233 if (list_empty(iova))
2234 return -EINVAL;
2235
2236 return 0;
2237 }
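/*
 * For example (illustrative values), excluding a reserved MSI window
 * [0xfee00000, 0xfeefffff] from a node [0x0, 0xffffffff] replaces that
 * node with [0x0, 0xfedfffff] and [0xfef00000, 0xffffffff]. If reserved
 * regions consume every node, the list ends up empty and -EINVAL is
 * returned above.
 */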
2238
2239 static void vfio_iommu_resv_free(struct list_head *resv_regions)
2240 {
2241 struct iommu_resv_region *n, *next;
2242
2243 list_for_each_entry_safe(n, next, resv_regions, list) {
2244 list_del(&n->list);
2245 kfree(n);
2246 }
2247 }
2248
2249 static void vfio_iommu_iova_free(struct list_head *iova)
2250 {
2251 struct vfio_iova *n, *next;
2252
2253 list_for_each_entry_safe(n, next, iova, list) {
2254 list_del(&n->list);
2255 kfree(n);
2256 }
2257 }
2258
2259 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2260 struct list_head *iova_copy)
2261 {
2262 struct list_head *iova = &iommu->iova_list;
2263 struct vfio_iova *n;
2264 int ret;
2265
2266 list_for_each_entry(n, iova, list) {
2267 ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2268 if (ret)
2269 goto out_free;
2270 }
2271
2272 return 0;
2273
2274 out_free:
2275 vfio_iommu_iova_free(iova_copy);
2276 return ret;
2277 }
2278
2279 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2280 struct list_head *iova_copy)
2281 {
2282 struct list_head *iova = &iommu->iova_list;
2283
2284 vfio_iommu_iova_free(iova);
2285
2286 list_splice_tail(iova_copy, iova);
2287 }
2288
2289 static int vfio_iommu_type1_attach_group(void *iommu_data,
2290 struct iommu_group *iommu_group)
2291 {
2292 struct vfio_iommu *iommu = iommu_data;
2293 struct vfio_iommu_group *group;
2294 struct vfio_domain *domain, *d;
2295 struct bus_type *bus = NULL;
2296 int ret;
2297 bool resv_msi, msi_remap;
2298 phys_addr_t resv_msi_base = 0;
2299 struct iommu_domain_geometry *geo;
2300 LIST_HEAD(iova_copy);
2301 LIST_HEAD(group_resv_regions);
2302
2303 mutex_lock(&iommu->lock);
2304
2305 /* Check for duplicates */
2306 if (vfio_iommu_find_iommu_group(iommu, iommu_group)) {
2307 mutex_unlock(&iommu->lock);
2308 return -EINVAL;
2309 }
2310
2311 group = kzalloc(sizeof(*group), GFP_KERNEL);
2312 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2313 if (!group || !domain) {
2314 ret = -ENOMEM;
2315 goto out_free;
2316 }
2317
2318 group->iommu_group = iommu_group;
2319
2320 /* Determine bus_type in order to allocate a domain */
2321 ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
2322 if (ret)
2323 goto out_free;
2324
2325 if (vfio_bus_is_mdev(bus)) {
2326 struct device *iommu_device = NULL;
2327
2328 group->mdev_group = true;
2329
2330 /* Determine the isolation type */
2331 ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
2332 vfio_mdev_iommu_device);
2333 if (ret || !iommu_device) {
2334 if (!iommu->external_domain) {
2335 INIT_LIST_HEAD(&domain->group_list);
2336 iommu->external_domain = domain;
2337 vfio_update_pgsize_bitmap(iommu);
2338 } else {
2339 kfree(domain);
2340 }
2341
2342 list_add(&group->next,
2343 &iommu->external_domain->group_list);
2344 /*
2345 * A non-IOMMU-backed group cannot dirty memory
2346 * directly; it can only use interfaces that provide
2347 * dirty tracking.
2348 * The iommu scope can only be promoted with the
2349 * addition of a dirty tracking group.
2350 */
2351 group->pinned_page_dirty_scope = true;
2352 mutex_unlock(&iommu->lock);
2353
2354 return 0;
2355 }
2356
2357 bus = iommu_device->bus;
2358 }
2359
2360 domain->domain = iommu_domain_alloc(bus);
2361 if (!domain->domain) {
2362 ret = -EIO;
2363 goto out_free;
2364 }
2365
2366 if (iommu->nesting) {
2367 ret = iommu_enable_nesting(domain->domain);
2368 if (ret)
2369 goto out_domain;
2370 }
2371
2372 ret = vfio_iommu_attach_group(domain, group);
2373 if (ret)
2374 goto out_domain;
2375
2376 /* Get aperture info */
2377 geo = &domain->domain->geometry;
2378 if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
2379 geo->aperture_end)) {
2380 ret = -EINVAL;
2381 goto out_detach;
2382 }
2383
2384 ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2385 if (ret)
2386 goto out_detach;
2387
2388 if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2389 ret = -EINVAL;
2390 goto out_detach;
2391 }
2392
2393 /*
2394 * We don't want to work on the original iova list as the list
2395 * gets modified and in case of failure we have to retain the
2396 * original list. Get a copy here.
2397 */
2398 ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2399 if (ret)
2400 goto out_detach;
2401
2402 ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
2403 geo->aperture_end);
2404 if (ret)
2405 goto out_detach;
2406
2407 ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2408 if (ret)
2409 goto out_detach;
2410
2411 resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2412
2413 INIT_LIST_HEAD(&domain->group_list);
2414 list_add(&group->next, &domain->group_list);
2415
2416 msi_remap = irq_domain_check_msi_remap() ||
2417 iommu_capable(bus, IOMMU_CAP_INTR_REMAP);
2418
2419 if (!allow_unsafe_interrupts && !msi_remap) {
2420 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2421 __func__);
2422 ret = -EPERM;
2423 goto out_detach;
2424 }
2425
2426 if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
2427 domain->prot |= IOMMU_CACHE;
2428
2429 /*
2430 * Try to match an existing compatible domain. We don't want to
2431 * preclude an IOMMU driver supporting multiple bus_types and being
2432 * able to include different bus_types in the same IOMMU domain, so
2433 * we test whether the domains use the same iommu_ops rather than
2434 * testing if they're on the same bus_type.
2435 */
2436 list_for_each_entry(d, &iommu->domain_list, next) {
2437 if (d->domain->ops == domain->domain->ops &&
2438 d->prot == domain->prot) {
2439 vfio_iommu_detach_group(domain, group);
2440 if (!vfio_iommu_attach_group(d, group)) {
2441 list_add(&group->next, &d->group_list);
2442 iommu_domain_free(domain->domain);
2443 kfree(domain);
2444 goto done;
2445 }
2446
2447 ret = vfio_iommu_attach_group(domain, group);
2448 if (ret)
2449 goto out_domain;
2450 }
2451 }
2452
2453 vfio_test_domain_fgsp(domain);
2454
2455 /* replay mappings on new domains */
2456 ret = vfio_iommu_replay(iommu, domain);
2457 if (ret)
2458 goto out_detach;
2459
2460 if (resv_msi) {
2461 ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2462 if (ret && ret != -ENODEV)
2463 goto out_detach;
2464 }
2465
2466 list_add(&domain->next, &iommu->domain_list);
2467 vfio_update_pgsize_bitmap(iommu);
2468 done:
2469 /* Delete the old iova list and insert the new one */
2470 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2471
2472 /*
2473 * An iommu backed group can dirty memory directly and therefore
2474 * demotes the iommu scope until it declares itself dirty tracking
2475 * capable via the page pinning interface.
2476 */
2477 iommu->num_non_pinned_groups++;
2478 mutex_unlock(&iommu->lock);
2479 vfio_iommu_resv_free(&group_resv_regions);
2480
2481 return 0;
2482
2483 out_detach:
2484 vfio_iommu_detach_group(domain, group);
2485 out_domain:
2486 iommu_domain_free(domain->domain);
2487 vfio_iommu_iova_free(&iova_copy);
2488 vfio_iommu_resv_free(&group_resv_regions);
2489 out_free:
2490 kfree(domain);
2491 kfree(group);
2492 mutex_unlock(&iommu->lock);
2493 return ret;
2494 }
2495
2496 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2497 {
2498 struct rb_node *node;
2499
2500 while ((node = rb_first(&iommu->dma_list)))
2501 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2502 }
2503
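/*
 * Called when the last IOMMU-backed domain is removed while an external
 * (mdev) domain remains: the IOMMU mappings are torn down, but pages that
 * are still pinned through entries on the pfn_list must stay accounted,
 * so the locked-memory count is adjusted to cover just those pages.
 */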
2504 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2505 {
2506 struct rb_node *n, *p;
2507
2508 n = rb_first(&iommu->dma_list);
2509 for (; n; n = rb_next(n)) {
2510 struct vfio_dma *dma;
2511 long locked = 0, unlocked = 0;
2512
2513 dma = rb_entry(n, struct vfio_dma, node);
2514 unlocked += vfio_unmap_unpin(iommu, dma, false);
2515 p = rb_first(&dma->pfn_list);
2516 for (; p; p = rb_next(p)) {
2517 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2518 node);
2519
2520 if (!is_invalid_reserved_pfn(vpfn->pfn))
2521 locked++;
2522 }
2523 vfio_lock_acct(dma, locked - unlocked, true);
2524 }
2525 }
2526
2527 /*
2528 * Called when a domain is removed in detach. It is possible that
2529 * the removed domain was the one constraining the iova aperture, so
2530 * recompute the aperture from the remaining domains' geometries.
2531 */
2532 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2533 struct list_head *iova_copy)
2534 {
2535 struct vfio_domain *domain;
2536 struct vfio_iova *node;
2537 dma_addr_t start = 0;
2538 dma_addr_t end = (dma_addr_t)~0;
2539
2540 if (list_empty(iova_copy))
2541 return;
2542
2543 list_for_each_entry(domain, &iommu->domain_list, next) {
2544 struct iommu_domain_geometry *geo = &domain->domain->geometry;
2545
2546 if (geo->aperture_start > start)
2547 start = geo->aperture_start;
2548 if (geo->aperture_end < end)
2549 end = geo->aperture_end;
2550 }
2551
2552 /* Modify aperture limits. The new aperture is either the same or bigger. */
2553 node = list_first_entry(iova_copy, struct vfio_iova, list);
2554 node->start = start;
2555 node = list_last_entry(iova_copy, struct vfio_iova, list);
2556 node->end = end;
2557 }
2558
2559 /*
2560 * Called when a group is detached. The reserved regions for that
2561 * group can be part of valid iova now. But since reserved regions
2562 * may be duplicated among groups, populate the iova valid regions
2563 * list again.
2564 */
2565 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2566 struct list_head *iova_copy)
2567 {
2568 struct vfio_domain *d;
2569 struct vfio_iommu_group *g;
2570 struct vfio_iova *node;
2571 dma_addr_t start, end;
2572 LIST_HEAD(resv_regions);
2573 int ret;
2574
2575 if (list_empty(iova_copy))
2576 return -EINVAL;
2577
2578 list_for_each_entry(d, &iommu->domain_list, next) {
2579 list_for_each_entry(g, &d->group_list, next) {
2580 ret = iommu_get_group_resv_regions(g->iommu_group,
2581 &resv_regions);
2582 if (ret)
2583 goto done;
2584 }
2585 }
2586
2587 node = list_first_entry(iova_copy, struct vfio_iova, list);
2588 start = node->start;
2589 node = list_last_entry(iova_copy, struct vfio_iova, list);
2590 end = node->end;
2591
2592 /* Purge the iova list and create a new one */
2593 vfio_iommu_iova_free(iova_copy);
2594
2595 ret = vfio_iommu_aper_resize(iova_copy, start, end);
2596 if (ret)
2597 goto done;
2598
2599 /* Exclude current reserved regions from iova ranges */
2600 ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2601 done:
2602 vfio_iommu_resv_free(&resv_regions);
2603 return ret;
2604 }
2605
2606 static void vfio_iommu_type1_detach_group(void *iommu_data,
2607 struct iommu_group *iommu_group)
2608 {
2609 struct vfio_iommu *iommu = iommu_data;
2610 struct vfio_domain *domain;
2611 struct vfio_iommu_group *group;
2612 bool update_dirty_scope = false;
2613 LIST_HEAD(iova_copy);
2614
2615 mutex_lock(&iommu->lock);
2616
2617 if (iommu->external_domain) {
2618 group = find_iommu_group(iommu->external_domain, iommu_group);
2619 if (group) {
2620 update_dirty_scope = !group->pinned_page_dirty_scope;
2621 list_del(&group->next);
2622 kfree(group);
2623
2624 if (list_empty(&iommu->external_domain->group_list)) {
2625 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
2626 WARN_ON(iommu->notifier.head);
2627 vfio_iommu_unmap_unpin_all(iommu);
2628 }
2629
2630 kfree(iommu->external_domain);
2631 iommu->external_domain = NULL;
2632 }
2633 goto detach_group_done;
2634 }
2635 }
2636
2637 /*
2638 * Get a copy of the iova list. This will be used to update
2639 * and then replace the current one later. Note that we leave
2640 * the original list untouched if the update fails.
2641 */
2642 vfio_iommu_iova_get_copy(iommu, &iova_copy);
2643
2644 list_for_each_entry(domain, &iommu->domain_list, next) {
2645 group = find_iommu_group(domain, iommu_group);
2646 if (!group)
2647 continue;
2648
2649 vfio_iommu_detach_group(domain, group);
2650 update_dirty_scope = !group->pinned_page_dirty_scope;
2651 list_del(&group->next);
2652 kfree(group);
2653 /*
2654 * Group ownership provides privilege; if the group list is
2655 * empty, the domain goes away. If it's the last domain with an
2656 * iommu and no external domain exists, then all the mappings go
2657 * away too. If it's the last domain with an iommu and an external
2658 * domain does exist, update the pinning accounting.
2659 */
2660 if (list_empty(&domain->group_list)) {
2661 if (list_is_singular(&iommu->domain_list)) {
2662 if (!iommu->external_domain) {
2663 WARN_ON(iommu->notifier.head);
2664 vfio_iommu_unmap_unpin_all(iommu);
2665 } else {
2666 vfio_iommu_unmap_unpin_reaccount(iommu);
2667 }
2668 }
2669 iommu_domain_free(domain->domain);
2670 list_del(&domain->next);
2671 kfree(domain);
2672 vfio_iommu_aper_expand(iommu, &iova_copy);
2673 vfio_update_pgsize_bitmap(iommu);
2674 }
2675 break;
2676 }
2677
2678 if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2679 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2680 else
2681 vfio_iommu_iova_free(&iova_copy);
2682
2683 detach_group_done:
2684 /*
2685 * Removal of a group without dirty tracking may allow the iommu scope
2686 * to be promoted.
2687 */
2688 if (update_dirty_scope) {
2689 iommu->num_non_pinned_groups--;
2690 if (iommu->dirty_page_tracking)
2691 vfio_iommu_populate_bitmap_full(iommu);
2692 }
2693 mutex_unlock(&iommu->lock);
2694 }
2695
2696 static void *vfio_iommu_type1_open(unsigned long arg)
2697 {
2698 struct vfio_iommu *iommu;
2699
2700 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2701 if (!iommu)
2702 return ERR_PTR(-ENOMEM);
2703
2704 switch (arg) {
2705 case VFIO_TYPE1_IOMMU:
2706 break;
2707 case VFIO_TYPE1_NESTING_IOMMU:
2708 iommu->nesting = true;
2709 fallthrough;
2710 case VFIO_TYPE1v2_IOMMU:
2711 iommu->v2 = true;
2712 break;
2713 default:
2714 kfree(iommu);
2715 return ERR_PTR(-EINVAL);
2716 }
2717
2718 INIT_LIST_HEAD(&iommu->domain_list);
2719 INIT_LIST_HEAD(&iommu->iova_list);
2720 iommu->dma_list = RB_ROOT;
2721 iommu->dma_avail = dma_entry_limit;
2722 iommu->container_open = true;
2723 mutex_init(&iommu->lock);
2724 BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
2725 init_waitqueue_head(&iommu->vaddr_wait);
2726
2727 return iommu;
2728 }
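/*
 * For reference, a minimal userspace sketch (not compiled here) of the
 * container setup that reaches vfio_iommu_type1_open() and
 * vfio_iommu_type1_attach_group(); the group number and the lack of
 * error handling are assumptions for brevity:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (status.flags & VFIO_GROUP_FLAGS_VIABLE) {
 *		ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *		if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
 *			ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *	}
 */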
2729
2730 static void vfio_release_domain(struct vfio_domain *domain, bool external)
2731 {
2732 struct vfio_iommu_group *group, *group_tmp;
2733
2734 list_for_each_entry_safe(group, group_tmp,
2735 &domain->group_list, next) {
2736 if (!external)
2737 vfio_iommu_detach_group(domain, group);
2738 list_del(&group->next);
2739 kfree(group);
2740 }
2741
2742 if (!external)
2743 iommu_domain_free(domain->domain);
2744 }
2745
2746 static void vfio_iommu_type1_release(void *iommu_data)
2747 {
2748 struct vfio_iommu *iommu = iommu_data;
2749 struct vfio_domain *domain, *domain_tmp;
2750
2751 if (iommu->external_domain) {
2752 vfio_release_domain(iommu->external_domain, true);
2753 kfree(iommu->external_domain);
2754 }
2755
2756 vfio_iommu_unmap_unpin_all(iommu);
2757
2758 list_for_each_entry_safe(domain, domain_tmp,
2759 &iommu->domain_list, next) {
2760 vfio_release_domain(domain, false);
2761 list_del(&domain->next);
2762 kfree(domain);
2763 }
2764
2765 vfio_iommu_iova_free(&iommu->iova_list);
2766
2767 kfree(iommu);
2768 }
2769
2770 static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
2771 {
2772 struct vfio_domain *domain;
2773 int ret = 1;
2774
2775 mutex_lock(&iommu->lock);
2776 list_for_each_entry(domain, &iommu->domain_list, next) {
2777 if (!(domain->prot & IOMMU_CACHE)) {
2778 ret = 0;
2779 break;
2780 }
2781 }
2782 mutex_unlock(&iommu->lock);
2783
2784 return ret;
2785 }
2786
2787 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2788 unsigned long arg)
2789 {
2790 switch (arg) {
2791 case VFIO_TYPE1_IOMMU:
2792 case VFIO_TYPE1v2_IOMMU:
2793 case VFIO_TYPE1_NESTING_IOMMU:
2794 case VFIO_UNMAP_ALL:
2795 case VFIO_UPDATE_VADDR:
2796 return 1;
2797 case VFIO_DMA_CC_IOMMU:
2798 if (!iommu)
2799 return 0;
2800 return vfio_domains_have_iommu_cache(iommu);
2801 default:
2802 return 0;
2803 }
2804 }
2805
2806 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2807 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2808 size_t size)
2809 {
2810 struct vfio_info_cap_header *header;
2811 struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2812
2813 header = vfio_info_cap_add(caps, size,
2814 VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2815 if (IS_ERR(header))
2816 return PTR_ERR(header);
2817
2818 iova_cap = container_of(header,
2819 struct vfio_iommu_type1_info_cap_iova_range,
2820 header);
2821 iova_cap->nr_iovas = cap_iovas->nr_iovas;
2822 memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2823 cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2824 return 0;
2825 }
2826
2827 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2828 struct vfio_info_cap *caps)
2829 {
2830 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2831 struct vfio_iova *iova;
2832 size_t size;
2833 int iovas = 0, i = 0, ret;
2834
2835 list_for_each_entry(iova, &iommu->iova_list, list)
2836 iovas++;
2837
2838 if (!iovas) {
2839 /*
2840 * Return 0 as a container with a single mdev device
2841 * will have an empty list
2842 */
2843 return 0;
2844 }
2845
2846 size = struct_size(cap_iovas, iova_ranges, iovas);
2847
2848 cap_iovas = kzalloc(size, GFP_KERNEL);
2849 if (!cap_iovas)
2850 return -ENOMEM;
2851
2852 cap_iovas->nr_iovas = iovas;
2853
2854 list_for_each_entry(iova, &iommu->iova_list, list) {
2855 cap_iovas->iova_ranges[i].start = iova->start;
2856 cap_iovas->iova_ranges[i].end = iova->end;
2857 i++;
2858 }
2859
2860 ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2861
2862 kfree(cap_iovas);
2863 return ret;
2864 }
2865
2866 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2867 struct vfio_info_cap *caps)
2868 {
2869 struct vfio_iommu_type1_info_cap_migration cap_mig = {};
2870
2871 cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2872 cap_mig.header.version = 1;
2873
2874 cap_mig.flags = 0;
2875 /* support minimum pgsize */
2876 cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2877 cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2878
2879 return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2880 }
2881
2882 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2883 struct vfio_info_cap *caps)
2884 {
2885 struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2886
2887 cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2888 cap_dma_avail.header.version = 1;
2889
2890 cap_dma_avail.avail = iommu->dma_avail;
2891
2892 return vfio_info_add_capability(caps, &cap_dma_avail.header,
2893 sizeof(cap_dma_avail));
2894 }
2895
2896 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2897 unsigned long arg)
2898 {
2899 struct vfio_iommu_type1_info info;
2900 unsigned long minsz;
2901 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2902 unsigned long capsz;
2903 int ret;
2904
2905 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2906
2907 /* For backward compatibility, cannot require this */
2908 capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
2909
2910 if (copy_from_user(&info, (void __user *)arg, minsz))
2911 return -EFAULT;
2912
2913 if (info.argsz < minsz)
2914 return -EINVAL;
2915
2916 if (info.argsz >= capsz) {
2917 minsz = capsz;
2918 info.cap_offset = 0; /* output, no-recopy necessary */
2919 }
2920
2921 mutex_lock(&iommu->lock);
2922 info.flags = VFIO_IOMMU_INFO_PGSIZES;
2923
2924 info.iova_pgsizes = iommu->pgsize_bitmap;
2925
2926 ret = vfio_iommu_migration_build_caps(iommu, &caps);
2927
2928 if (!ret)
2929 ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2930
2931 if (!ret)
2932 ret = vfio_iommu_iova_build_caps(iommu, &caps);
2933
2934 mutex_unlock(&iommu->lock);
2935
2936 if (ret)
2937 return ret;
2938
2939 if (caps.size) {
2940 info.flags |= VFIO_IOMMU_INFO_CAPS;
2941
2942 if (info.argsz < sizeof(info) + caps.size) {
2943 info.argsz = sizeof(info) + caps.size;
2944 } else {
2945 vfio_info_cap_shift(&caps, sizeof(info));
2946 if (copy_to_user((void __user *)arg +
2947 sizeof(info), caps.buf,
2948 caps.size)) {
2949 kfree(caps.buf);
2950 return -EFAULT;
2951 }
2952 info.cap_offset = sizeof(info);
2953 }
2954
2955 kfree(caps.buf);
2956 }
2957
2958 return copy_to_user((void __user *)arg, &info, minsz) ?
2959 -EFAULT : 0;
2960 }
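/*
 * A minimal userspace sketch (not compiled here) of consuming the info and
 * capability chain assembled above; the fixed 4 KiB buffer and the
 * hypothetical handle_iova_ranges() helper are assumptions for brevity:
 *
 *	struct vfio_iommu_type1_info *info = calloc(1, 4096);
 *	struct vfio_info_cap_header *hdr;
 *	__u32 offset;
 *
 *	info->argsz = 4096;
 *	ioctl(container, VFIO_IOMMU_GET_INFO, info);
 *
 *	if (info->flags & VFIO_IOMMU_INFO_CAPS) {
 *		for (offset = info->cap_offset; offset; offset = hdr->next) {
 *			hdr = (void *)((char *)info + offset);
 *			if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE)
 *				handle_iova_ranges((void *)hdr);
 *		}
 *	}
 */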
2961
2962 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2963 unsigned long arg)
2964 {
2965 struct vfio_iommu_type1_dma_map map;
2966 unsigned long minsz;
2967 uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
2968 VFIO_DMA_MAP_FLAG_VADDR;
2969
2970 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2971
2972 if (copy_from_user(&map, (void __user *)arg, minsz))
2973 return -EFAULT;
2974
2975 if (map.argsz < minsz || map.flags & ~mask)
2976 return -EINVAL;
2977
2978 return vfio_dma_do_map(iommu, &map);
2979 }
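/*
 * A minimal userspace sketch (not compiled here) of the corresponding
 * VFIO_IOMMU_MAP_DMA call; the buffer, size and iova choice are
 * illustrative assumptions:
 *
 *	void *buf = mmap(NULL, 1024 * 1024, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0x100000,
 *		.size  = 1024 * 1024,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */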
2980
2981 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2982 unsigned long arg)
2983 {
2984 struct vfio_iommu_type1_dma_unmap unmap;
2985 struct vfio_bitmap bitmap = { 0 };
2986 uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
2987 VFIO_DMA_UNMAP_FLAG_VADDR |
2988 VFIO_DMA_UNMAP_FLAG_ALL;
2989 unsigned long minsz;
2990 int ret;
2991
2992 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2993
2994 if (copy_from_user(&unmap, (void __user *)arg, minsz))
2995 return -EFAULT;
2996
2997 if (unmap.argsz < minsz || unmap.flags & ~mask)
2998 return -EINVAL;
2999
3000 if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
3001 (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
3002 VFIO_DMA_UNMAP_FLAG_VADDR)))
3003 return -EINVAL;
3004
3005 if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
3006 unsigned long pgshift;
3007
3008 if (unmap.argsz < (minsz + sizeof(bitmap)))
3009 return -EINVAL;
3010
3011 if (copy_from_user(&bitmap,
3012 (void __user *)(arg + minsz),
3013 sizeof(bitmap)))
3014 return -EFAULT;
3015
3016 if (!access_ok((void __user *)bitmap.data, bitmap.size))
3017 return -EINVAL;
3018
3019 pgshift = __ffs(bitmap.pgsize);
3020 ret = verify_bitmap_size(unmap.size >> pgshift,
3021 bitmap.size);
3022 if (ret)
3023 return ret;
3024 }
3025
3026 ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
3027 if (ret)
3028 return ret;
3029
3030 return copy_to_user((void __user *)arg, &unmap, minsz) ?
3031 -EFAULT : 0;
3032 }
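/*
 * A minimal userspace sketch (not compiled here) of the plain unmap path
 * (no dirty bitmap), matching the map sketch above:
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0x100000,
 *		.size  = 1024 * 1024,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
 *
 * On return, unmap.size holds the number of bytes actually unmapped.
 */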
3033
3034 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
3035 unsigned long arg)
3036 {
3037 struct vfio_iommu_type1_dirty_bitmap dirty;
3038 uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
3039 VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
3040 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
3041 unsigned long minsz;
3042 int ret = 0;
3043
3044 if (!iommu->v2)
3045 return -EACCES;
3046
3047 minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
3048
3049 if (copy_from_user(&dirty, (void __user *)arg, minsz))
3050 return -EFAULT;
3051
3052 if (dirty.argsz < minsz || dirty.flags & ~mask)
3053 return -EINVAL;
3054
3055 /* only one flag should be set at a time */
3056 if (__ffs(dirty.flags) != __fls(dirty.flags))
3057 return -EINVAL;
3058
3059 if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
3060 size_t pgsize;
3061
3062 mutex_lock(&iommu->lock);
3063 pgsize = 1 << __ffs(iommu->pgsize_bitmap);
3064 if (!iommu->dirty_page_tracking) {
3065 ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
3066 if (!ret)
3067 iommu->dirty_page_tracking = true;
3068 }
3069 mutex_unlock(&iommu->lock);
3070 return ret;
3071 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
3072 mutex_lock(&iommu->lock);
3073 if (iommu->dirty_page_tracking) {
3074 iommu->dirty_page_tracking = false;
3075 vfio_dma_bitmap_free_all(iommu);
3076 }
3077 mutex_unlock(&iommu->lock);
3078 return 0;
3079 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
3080 struct vfio_iommu_type1_dirty_bitmap_get range;
3081 unsigned long pgshift;
3082 size_t data_size = dirty.argsz - minsz;
3083 size_t iommu_pgsize;
3084
3085 if (!data_size || data_size < sizeof(range))
3086 return -EINVAL;
3087
3088 if (copy_from_user(&range, (void __user *)(arg + minsz),
3089 sizeof(range)))
3090 return -EFAULT;
3091
3092 if (range.iova + range.size < range.iova)
3093 return -EINVAL;
3094 if (!access_ok((void __user *)range.bitmap.data,
3095 range.bitmap.size))
3096 return -EINVAL;
3097
3098 pgshift = __ffs(range.bitmap.pgsize);
3099 ret = verify_bitmap_size(range.size >> pgshift,
3100 range.bitmap.size);
3101 if (ret)
3102 return ret;
3103
3104 mutex_lock(&iommu->lock);
3105
3106 iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
3107
3108 /* allow only smallest supported pgsize */
3109 if (range.bitmap.pgsize != iommu_pgsize) {
3110 ret = -EINVAL;
3111 goto out_unlock;
3112 }
3113 if (range.iova & (iommu_pgsize - 1)) {
3114 ret = -EINVAL;
3115 goto out_unlock;
3116 }
3117 if (!range.size || range.size & (iommu_pgsize - 1)) {
3118 ret = -EINVAL;
3119 goto out_unlock;
3120 }
3121
3122 if (iommu->dirty_page_tracking)
3123 ret = vfio_iova_dirty_bitmap(range.bitmap.data,
3124 iommu, range.iova,
3125 range.size,
3126 range.bitmap.pgsize);
3127 else
3128 ret = -EINVAL;
3129 out_unlock:
3130 mutex_unlock(&iommu->lock);
3131
3132 return ret;
3133 }
3134
3135 return -EINVAL;
3136 }
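/*
 * A minimal userspace sketch (not compiled here) of dirty page tracking;
 * the iova/size values are illustrative and a 4 KiB smallest IOMMU page
 * size (as reported via VFIO_IOMMU_GET_INFO) is assumed:
 *
 *	struct vfio_iommu_type1_dirty_bitmap start = {
 *		.argsz = sizeof(start),
 *		.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
 *	};
 *	struct {
 *		struct vfio_iommu_type1_dirty_bitmap header;
 *		struct vfio_iommu_type1_dirty_bitmap_get get;
 *	} req = {
 *		.header.argsz = sizeof(req),
 *		.header.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP,
 *		.get.iova = 0x100000,
 *		.get.size = 1024 * 1024,
 *		.get.bitmap.pgsize = 4096,
 *		.get.bitmap.size = (1024 * 1024 / 4096 + 63) / 64 * 8,
 *	};
 *
 *	req.get.bitmap.data = calloc(1, req.get.bitmap.size);
 *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &start);
 *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &req);
 */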
3137
3138 static long vfio_iommu_type1_ioctl(void *iommu_data,
3139 unsigned int cmd, unsigned long arg)
3140 {
3141 struct vfio_iommu *iommu = iommu_data;
3142
3143 switch (cmd) {
3144 case VFIO_CHECK_EXTENSION:
3145 return vfio_iommu_type1_check_extension(iommu, arg);
3146 case VFIO_IOMMU_GET_INFO:
3147 return vfio_iommu_type1_get_info(iommu, arg);
3148 case VFIO_IOMMU_MAP_DMA:
3149 return vfio_iommu_type1_map_dma(iommu, arg);
3150 case VFIO_IOMMU_UNMAP_DMA:
3151 return vfio_iommu_type1_unmap_dma(iommu, arg);
3152 case VFIO_IOMMU_DIRTY_PAGES:
3153 return vfio_iommu_type1_dirty_pages(iommu, arg);
3154 default:
3155 return -ENOTTY;
3156 }
3157 }
3158
3159 static int vfio_iommu_type1_register_notifier(void *iommu_data,
3160 unsigned long *events,
3161 struct notifier_block *nb)
3162 {
3163 struct vfio_iommu *iommu = iommu_data;
3164
3165 /* clear known events */
3166 *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
3167
3168 /* refuse to register if any unknown events remain */
3169 if (*events)
3170 return -EINVAL;
3171
3172 return blocking_notifier_chain_register(&iommu->notifier, nb);
3173 }
3174
3175 static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
3176 struct notifier_block *nb)
3177 {
3178 struct vfio_iommu *iommu = iommu_data;
3179
3180 return blocking_notifier_chain_unregister(&iommu->notifier, nb);
3181 }
3182
3183 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
3184 dma_addr_t user_iova, void *data,
3185 size_t count, bool write,
3186 size_t *copied)
3187 {
3188 struct mm_struct *mm;
3189 unsigned long vaddr;
3190 struct vfio_dma *dma;
3191 bool kthread = current->mm == NULL;
3192 size_t offset;
3193 int ret;
3194
3195 *copied = 0;
3196
3197 ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma);
3198 if (ret < 0)
3199 return ret;
3200
3201 if ((write && !(dma->prot & IOMMU_WRITE)) ||
3202 !(dma->prot & IOMMU_READ))
3203 return -EPERM;
3204
3205 mm = dma->mm;
3206 if (!mmget_not_zero(mm))
3207 return -EPERM;
3208
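/*
 * A kthread caller has no mm of its own and temporarily adopts the
 * mapping task's mm so the copy_to_user()/copy_from_user() below resolve
 * against the vaddr recorded in the vfio_dma; any other caller must
 * already be running with that mm, otherwise we bail out.
 */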
3209 if (kthread)
3210 kthread_use_mm(mm);
3211 else if (current->mm != mm)
3212 goto out;
3213
3214 offset = user_iova - dma->iova;
3215
3216 if (count > dma->size - offset)
3217 count = dma->size - offset;
3218
3219 vaddr = dma->vaddr + offset;
3220
3221 if (write) {
3222 *copied = copy_to_user((void __user *)vaddr, data,
3223 count) ? 0 : count;
3224 if (*copied && iommu->dirty_page_tracking) {
3225 unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3226 /*
3227 * Bitmap populated with the smallest supported page
3228 * size
3229 */
3230 bitmap_set(dma->bitmap, offset >> pgshift,
3231 ((offset + *copied - 1) >> pgshift) -
3232 (offset >> pgshift) + 1);
3233 }
3234 } else
3235 *copied = copy_from_user(data, (void __user *)vaddr,
3236 count) ? 0 : count;
3237 if (kthread)
3238 kthread_unuse_mm(mm);
3239 out:
3240 mmput(mm);
3241 return *copied ? 0 : -EFAULT;
3242 }
3243
3244 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3245 void *data, size_t count, bool write)
3246 {
3247 struct vfio_iommu *iommu = iommu_data;
3248 int ret = 0;
3249 size_t done;
3250
3251 mutex_lock(&iommu->lock);
3252 while (count > 0) {
3253 ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3254 count, write, &done);
3255 if (ret)
3256 break;
3257
3258 count -= done;
3259 data += done;
3260 user_iova += done;
3261 }
3262
3263 mutex_unlock(&iommu->lock);
3264 return ret;
3265 }
3266
3267 static struct iommu_domain *
3268 vfio_iommu_type1_group_iommu_domain(void *iommu_data,
3269 struct iommu_group *iommu_group)
3270 {
3271 struct iommu_domain *domain = ERR_PTR(-ENODEV);
3272 struct vfio_iommu *iommu = iommu_data;
3273 struct vfio_domain *d;
3274
3275 if (!iommu || !iommu_group)
3276 return ERR_PTR(-EINVAL);
3277
3278 mutex_lock(&iommu->lock);
3279 list_for_each_entry(d, &iommu->domain_list, next) {
3280 if (find_iommu_group(d, iommu_group)) {
3281 domain = d->domain;
3282 break;
3283 }
3284 }
3285 mutex_unlock(&iommu->lock);
3286
3287 return domain;
3288 }
3289
3290 static void vfio_iommu_type1_notify(void *iommu_data,
3291 enum vfio_iommu_notify_type event)
3292 {
3293 struct vfio_iommu *iommu = iommu_data;
3294
3295 if (event != VFIO_IOMMU_CONTAINER_CLOSE)
3296 return;
3297 mutex_lock(&iommu->lock);
3298 iommu->container_open = false;
3299 mutex_unlock(&iommu->lock);
3300 wake_up_all(&iommu->vaddr_wait);
3301 }
3302
3303 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3304 .name = "vfio-iommu-type1",
3305 .owner = THIS_MODULE,
3306 .open = vfio_iommu_type1_open,
3307 .release = vfio_iommu_type1_release,
3308 .ioctl = vfio_iommu_type1_ioctl,
3309 .attach_group = vfio_iommu_type1_attach_group,
3310 .detach_group = vfio_iommu_type1_detach_group,
3311 .pin_pages = vfio_iommu_type1_pin_pages,
3312 .unpin_pages = vfio_iommu_type1_unpin_pages,
3313 .register_notifier = vfio_iommu_type1_register_notifier,
3314 .unregister_notifier = vfio_iommu_type1_unregister_notifier,
3315 .dma_rw = vfio_iommu_type1_dma_rw,
3316 .group_iommu_domain = vfio_iommu_type1_group_iommu_domain,
3317 .notify = vfio_iommu_type1_notify,
3318 };
3319
3320 static int __init vfio_iommu_type1_init(void)
3321 {
3322 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3323 }
3324
3325 static void __exit vfio_iommu_type1_cleanup(void)
3326 {
3327 vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3328 }
3329
3330 module_init(vfio_iommu_type1_init);
3331 module_exit(vfio_iommu_type1_cleanup);
3332
3333 MODULE_VERSION(DRIVER_VERSION);
3334 MODULE_LICENSE("GPL v2");
3335 MODULE_AUTHOR(DRIVER_AUTHOR);
3336 MODULE_DESCRIPTION(DRIVER_DESC);
3337