1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
4 *
5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6 * Author: Alex Williamson <alex.williamson@redhat.com>
7 *
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
11 *
12 * We arbitrarily define a Type1 IOMMU as one matching the below code.
13 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
14 * VT-d, but that makes it harder to re-use as theoretically anyone
15 * implementing a similar IOMMU could make use of this. We expect the
16 * IOMMU to support the IOMMU API and have few to no restrictions around
17 * the IOVA range that can be mapped. The Type1 IOMMU is currently
18 * optimized for relatively static mappings of a userspace process with
19 * userspace pages pinned into memory. We also assume devices and IOMMU
20 * domains are PCI based as the IOMMU API is still centered around a
21 * device/bus interface rather than a group interface.
22 */
23
24 #include <linux/compat.h>
25 #include <linux/device.h>
26 #include <linux/fs.h>
27 #include <linux/highmem.h>
28 #include <linux/iommu.h>
29 #include <linux/module.h>
30 #include <linux/mm.h>
31 #include <linux/kthread.h>
32 #include <linux/rbtree.h>
33 #include <linux/sched/signal.h>
34 #include <linux/sched/mm.h>
35 #include <linux/slab.h>
36 #include <linux/uaccess.h>
37 #include <linux/vfio.h>
38 #include <linux/workqueue.h>
39 #include <linux/mdev.h>
40 #include <linux/notifier.h>
41 #include <linux/dma-iommu.h>
42 #include <linux/irqdomain.h>
43
44 #define DRIVER_VERSION "0.2"
45 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
46 #define DRIVER_DESC "Type1 IOMMU driver for VFIO"
47
48 static bool allow_unsafe_interrupts;
49 module_param_named(allow_unsafe_interrupts,
50 allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
51 MODULE_PARM_DESC(allow_unsafe_interrupts,
52 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
53
54 static bool disable_hugepages;
55 module_param_named(disable_hugepages,
56 disable_hugepages, bool, S_IRUGO | S_IWUSR);
57 MODULE_PARM_DESC(disable_hugepages,
58 "Disable VFIO IOMMU support for IOMMU hugepages.");
59
60 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
61 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
62 MODULE_PARM_DESC(dma_entry_limit,
63 "Maximum number of user DMA mappings per container (65535).");
64
65 struct vfio_iommu {
66 struct list_head domain_list;
67 struct list_head iova_list;
68 struct vfio_domain *external_domain; /* domain for external user */
69 struct mutex lock;
70 struct rb_root dma_list;
71 struct blocking_notifier_head notifier;
72 unsigned int dma_avail;
73 uint64_t pgsize_bitmap;
74 bool v2;
75 bool nesting;
76 bool dirty_page_tracking;
77 bool pinned_page_dirty_scope;
78 };
79
80 struct vfio_domain {
81 struct iommu_domain *domain;
82 struct list_head next;
83 struct list_head group_list;
84 int prot; /* IOMMU_CACHE */
85 bool fgsp; /* Fine-grained super pages */
86 };
87
88 struct vfio_dma {
89 struct rb_node node;
90 dma_addr_t iova; /* Device address */
91 unsigned long vaddr; /* Process virtual addr */
92 size_t size; /* Map size (bytes) */
93 int prot; /* IOMMU_READ/WRITE */
94 bool iommu_mapped;
95 bool lock_cap; /* capable(CAP_IPC_LOCK) */
96 struct task_struct *task;
97 struct rb_root pfn_list; /* Ex-user pinned pfn list */
98 unsigned long *bitmap;
99 };
100
101 struct vfio_group {
102 struct iommu_group *iommu_group;
103 struct list_head next;
104 bool mdev_group; /* An mdev group */
105 bool pinned_page_dirty_scope;
106 };
107
108 struct vfio_iova {
109 struct list_head list;
110 dma_addr_t start;
111 dma_addr_t end;
112 };
113
114 /*
115 * Guest RAM pinning working set or DMA target
116 */
117 struct vfio_pfn {
118 struct rb_node node;
119 dma_addr_t iova; /* Device address */
120 unsigned long pfn; /* Host pfn */
121 unsigned int ref_count;
122 };
123
124 struct vfio_regions {
125 struct list_head list;
126 dma_addr_t iova;
127 phys_addr_t phys;
128 size_t len;
129 };
130
131 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
132 (!list_empty(&iommu->domain_list))
133
134 #define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
135
136 /*
137 * The number-of-bits argument to bitmap_set() is an unsigned int, which is
138 * further cast to a signed int by the unaligned multi-bit helper,
139 * __bitmap_set().
140 * The maximum supported bitmap size is therefore 2^31 bits, i.e. 2^31 / 2^3
141 * = 2^28 bytes (256 MB), which covers 2^31 * 2^12 = 2^43 bytes (8 TB) on a
142 * 4K page system.
143 */
144 #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX)
145 #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
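/*
 * For a rough sense of scale (illustrative numbers, assuming 4K pages):
 * tracking 1 TB of IOVA needs 2^40 / 2^12 = 2^28 bits, i.e. a 32 MB bitmap,
 * while the INT_MAX cap above bounds any single vfio_dma bitmap to 256 MB,
 * which covers 8 TB of mapped IOVA.
 */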
146
147 static int put_pfn(unsigned long pfn, int prot);
148
149 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
150 struct iommu_group *iommu_group);
151
152 static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
153 /*
154 * This code handles mapping and unmapping of user data buffers
155 * into DMA'ble space using the IOMMU
156 */
157
158 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
159 dma_addr_t start, size_t size)
160 {
161 struct rb_node *node = iommu->dma_list.rb_node;
162
163 while (node) {
164 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
165
166 if (start + size <= dma->iova)
167 node = node->rb_left;
168 else if (start >= dma->iova + dma->size)
169 node = node->rb_right;
170 else
171 return dma;
172 }
173
174 return NULL;
175 }
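/*
 * dma_list is an rb-tree of non-overlapping [iova, iova + size) ranges
 * ordered by iova, so the walk above is an overlap query: it returns any
 * vfio_dma intersecting [start, start + size), or NULL. Callers probe
 * boundaries with small sizes, e.g. vfio_find_dma(iommu, iova, 1) asks
 * whether any mapping covers the single byte at iova.
 */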
176
177 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
178 {
179 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
180 struct vfio_dma *dma;
181
182 while (*link) {
183 parent = *link;
184 dma = rb_entry(parent, struct vfio_dma, node);
185
186 if (new->iova + new->size <= dma->iova)
187 link = &(*link)->rb_left;
188 else
189 link = &(*link)->rb_right;
190 }
191
192 rb_link_node(&new->node, parent, link);
193 rb_insert_color(&new->node, &iommu->dma_list);
194 }
195
196 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
197 {
198 rb_erase(&old->node, &iommu->dma_list);
199 }
200
201
202 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
203 {
204 uint64_t npages = dma->size / pgsize;
205
206 if (npages > DIRTY_BITMAP_PAGES_MAX)
207 return -EINVAL;
208
209 /*
210 * Allocate an extra 64 bits of headroom so that bitmap_shift_left() can
211 * shift and merge an unaligned number of pages with the bits of adjacent
212 * vfio_dma ranges.
213 */
214 dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
215 GFP_KERNEL);
216 if (!dma->bitmap)
217 return -ENOMEM;
218
219 return 0;
220 }
221
222 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
223 {
224 kfree(dma->bitmap);
225 dma->bitmap = NULL;
226 }
227
228 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
229 {
230 struct rb_node *p;
231 unsigned long pgshift = __ffs(pgsize);
232
233 for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
234 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
235
236 bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
237 }
238 }
239
240 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
241 {
242 struct rb_node *n;
243 unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
244
245 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
246 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
247
248 bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
249 }
250 }
251
252 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
253 {
254 struct rb_node *n;
255
256 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
257 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
258 int ret;
259
260 ret = vfio_dma_bitmap_alloc(dma, pgsize);
261 if (ret) {
262 struct rb_node *p;
263
264 for (p = rb_prev(n); p; p = rb_prev(p)) {
265 struct vfio_dma *dma = rb_entry(p,
266 struct vfio_dma, node);
267
268 vfio_dma_bitmap_free(dma);
269 }
270 return ret;
271 }
272 vfio_dma_populate_bitmap(dma, pgsize);
273 }
274 return 0;
275 }
276
277 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
278 {
279 struct rb_node *n;
280
281 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
282 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
283
284 vfio_dma_bitmap_free(dma);
285 }
286 }
287
288 /*
289 * Helper Functions for host iova-pfn list
290 */
291 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
292 {
293 struct vfio_pfn *vpfn;
294 struct rb_node *node = dma->pfn_list.rb_node;
295
296 while (node) {
297 vpfn = rb_entry(node, struct vfio_pfn, node);
298
299 if (iova < vpfn->iova)
300 node = node->rb_left;
301 else if (iova > vpfn->iova)
302 node = node->rb_right;
303 else
304 return vpfn;
305 }
306 return NULL;
307 }
308
309 static void vfio_link_pfn(struct vfio_dma *dma,
310 struct vfio_pfn *new)
311 {
312 struct rb_node **link, *parent = NULL;
313 struct vfio_pfn *vpfn;
314
315 link = &dma->pfn_list.rb_node;
316 while (*link) {
317 parent = *link;
318 vpfn = rb_entry(parent, struct vfio_pfn, node);
319
320 if (new->iova < vpfn->iova)
321 link = &(*link)->rb_left;
322 else
323 link = &(*link)->rb_right;
324 }
325
326 rb_link_node(&new->node, parent, link);
327 rb_insert_color(&new->node, &dma->pfn_list);
328 }
329
330 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
331 {
332 rb_erase(&old->node, &dma->pfn_list);
333 }
334
335 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
336 unsigned long pfn)
337 {
338 struct vfio_pfn *vpfn;
339
340 vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
341 if (!vpfn)
342 return -ENOMEM;
343
344 vpfn->iova = iova;
345 vpfn->pfn = pfn;
346 vpfn->ref_count = 1;
347 vfio_link_pfn(dma, vpfn);
348 return 0;
349 }
350
351 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
352 struct vfio_pfn *vpfn)
353 {
354 vfio_unlink_pfn(dma, vpfn);
355 kfree(vpfn);
356 }
357
358 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
359 unsigned long iova)
360 {
361 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
362
363 if (vpfn)
364 vpfn->ref_count++;
365 return vpfn;
366 }
367
368 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
369 {
370 int ret = 0;
371
372 vpfn->ref_count--;
373 if (!vpfn->ref_count) {
374 ret = put_pfn(vpfn->pfn, dma->prot);
375 vfio_remove_from_pfn_list(dma, vpfn);
376 }
377 return ret;
378 }
379
380 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
381 {
382 struct mm_struct *mm;
383 int ret;
384
385 if (!npage)
386 return 0;
387
388 mm = async ? get_task_mm(dma->task) : dma->task->mm;
389 if (!mm)
390 return -ESRCH; /* process exited */
391
392 ret = mmap_write_lock_killable(mm);
393 if (!ret) {
394 ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
395 dma->lock_cap);
396 mmap_write_unlock(mm);
397 }
398
399 if (async)
400 mmput(mm);
401
402 return ret;
403 }
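/*
 * Locked-memory accounting, in short: a positive npage charges that many
 * pages to the mapping task's mm->locked_vm, a negative value credits them
 * back, and dma->lock_cap (CAP_IPC_LOCK sampled at MAP_DMA time) lets the
 * charge exceed RLIMIT_MEMLOCK. The async path takes its own mm reference
 * because mdev pin/unpin requests can arrive from a task other than the one
 * that created the mapping.
 */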
404
405 /*
406 * Some mappings aren't backed by a struct page, for example an mmap'd
407 * MMIO range for our own or another device. These use a different
408 * pfn conversion and shouldn't be tracked as locked pages.
409 * For compound pages, any driver that sets the reserved bit in head
410 * page needs to set the reserved bit in all subpages to be safe.
411 */
412 static bool is_invalid_reserved_pfn(unsigned long pfn)
413 {
414 if (pfn_valid(pfn))
415 return PageReserved(pfn_to_page(pfn));
416
417 return true;
418 }
419
420 static int put_pfn(unsigned long pfn, int prot)
421 {
422 if (!is_invalid_reserved_pfn(pfn)) {
423 struct page *page = pfn_to_page(pfn);
424
425 unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
426 return 1;
427 }
428 return 0;
429 }
430
431 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
432 unsigned long vaddr, unsigned long *pfn,
433 bool write_fault)
434 {
435 pte_t *ptep;
436 spinlock_t *ptl;
437 int ret;
438
439 ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
440 if (ret) {
441 bool unlocked = false;
442
443 ret = fixup_user_fault(mm, vaddr,
444 FAULT_FLAG_REMOTE |
445 (write_fault ? FAULT_FLAG_WRITE : 0),
446 &unlocked);
447 if (unlocked)
448 return -EAGAIN;
449
450 if (ret)
451 return ret;
452
453 ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
454 if (ret)
455 return ret;
456 }
457
458 if (write_fault && !pte_write(*ptep))
459 ret = -EFAULT;
460 else
461 *pfn = pte_pfn(*ptep);
462
463 pte_unmap_unlock(ptep, ptl);
464 return ret;
465 }
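/*
 * Roughly: follow_pte() only succeeds if the PTE is already present, so on
 * failure we fault the page in with fixup_user_fault() and try again. If
 * the fault handler had to drop the mmap lock ("unlocked"), the VMA we were
 * handed may be stale, so -EAGAIN tells the caller to re-lookup the VMA and
 * retry rather than trust the old pointer.
 */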
466
467 static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
468 int prot, unsigned long *pfn)
469 {
470 struct page *page[1];
471 struct vm_area_struct *vma;
472 unsigned int flags = 0;
473 int ret;
474
475 if (prot & IOMMU_WRITE)
476 flags |= FOLL_WRITE;
477
478 mmap_read_lock(mm);
479 ret = pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM,
480 page, NULL, NULL);
481 if (ret == 1) {
482 *pfn = page_to_pfn(page[0]);
483 ret = 0;
484 goto done;
485 }
486
487 vaddr = untagged_addr(vaddr);
488
489 retry:
490 vma = find_vma_intersection(mm, vaddr, vaddr + 1);
491
492 if (vma && vma->vm_flags & VM_PFNMAP) {
493 ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
494 if (ret == -EAGAIN)
495 goto retry;
496
497 if (!ret && !is_invalid_reserved_pfn(*pfn))
498 ret = -EFAULT;
499 }
500 done:
501 mmap_read_unlock(mm);
502 return ret;
503 }
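/*
 * Two lookup paths, minimal sketch: ordinary memory is long-term pinned via
 * pin_user_pages_remote(FOLL_LONGTERM), taking page references that put_pfn()
 * later drops. VM_PFNMAP regions (e.g. another device's mmap'd MMIO BAR) have
 * no struct page to pin, so we walk the PTE instead, and the callers use
 * is_invalid_reserved_pfn() to keep such pfns out of locked-vm accounting.
 */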
504
505 /*
506 * Attempt to pin pages. We really don't want to track all the pfns and
507 * the iommu can only map chunks of consecutive pfns anyway, so get the
508 * first page and all consecutive pages with the same locking.
509 */
510 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
511 long npage, unsigned long *pfn_base,
512 unsigned long limit)
513 {
514 unsigned long pfn = 0;
515 long ret, pinned = 0, lock_acct = 0;
516 bool rsvd;
517 dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
518
519 /* This code path is only user initiated */
520 if (!current->mm)
521 return -ENODEV;
522
523 ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
524 if (ret)
525 return ret;
526
527 pinned++;
528 rsvd = is_invalid_reserved_pfn(*pfn_base);
529
530 /*
531 * Reserved pages aren't counted against the user, externally pinned
532 * pages are already counted against the user.
533 */
534 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
535 if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
536 put_pfn(*pfn_base, dma->prot);
537 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
538 limit << PAGE_SHIFT);
539 return -ENOMEM;
540 }
541 lock_acct++;
542 }
543
544 if (unlikely(disable_hugepages))
545 goto out;
546
547 /* Lock all the consecutive pages from pfn_base */
548 for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
549 pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
550 ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
551 if (ret)
552 break;
553
554 if (pfn != *pfn_base + pinned ||
555 rsvd != is_invalid_reserved_pfn(pfn)) {
556 put_pfn(pfn, dma->prot);
557 break;
558 }
559
560 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
561 if (!dma->lock_cap &&
562 current->mm->locked_vm + lock_acct + 1 > limit) {
563 put_pfn(pfn, dma->prot);
564 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
565 __func__, limit << PAGE_SHIFT);
566 ret = -ENOMEM;
567 goto unpin_out;
568 }
569 lock_acct++;
570 }
571 }
572
573 out:
574 ret = vfio_lock_acct(dma, lock_acct, false);
575
576 unpin_out:
577 if (ret) {
578 if (!rsvd) {
579 for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
580 put_pfn(pfn, dma->prot);
581 }
582
583 return ret;
584 }
585
586 return pinned;
587 }
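/*
 * Batching sketch: after pinning the first page, keep pinning while the host
 * pfns stay physically consecutive (and their reserved-ness doesn't change),
 * so the caller can hand the whole run to iommu_map() in one go and let the
 * IOMMU use superpages. For example, 512 consecutive 4K pages can become a
 * single 2M mapping; with disable_hugepages set we bail out after one page.
 */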
588
589 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
590 unsigned long pfn, long npage,
591 bool do_accounting)
592 {
593 long unlocked = 0, locked = 0;
594 long i;
595
596 for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
597 if (put_pfn(pfn++, dma->prot)) {
598 unlocked++;
599 if (vfio_find_vpfn(dma, iova))
600 locked++;
601 }
602 }
603
604 if (do_accounting)
605 vfio_lock_acct(dma, locked - unlocked, true);
606
607 return unlocked;
608 }
609
610 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
611 unsigned long *pfn_base, bool do_accounting)
612 {
613 struct mm_struct *mm;
614 int ret;
615
616 mm = get_task_mm(dma->task);
617 if (!mm)
618 return -ENODEV;
619
620 ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
621 if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
622 ret = vfio_lock_acct(dma, 1, true);
623 if (ret) {
624 put_pfn(*pfn_base, dma->prot);
625 if (ret == -ENOMEM)
626 pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
627 "(%ld) exceeded\n", __func__,
628 dma->task->comm, task_pid_nr(dma->task),
629 task_rlimit(dma->task, RLIMIT_MEMLOCK));
630 }
631 }
632
633 mmput(mm);
634 return ret;
635 }
636
637 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
638 bool do_accounting)
639 {
640 int unlocked;
641 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
642
643 if (!vpfn)
644 return 0;
645
646 unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
647
648 if (do_accounting)
649 vfio_lock_acct(dma, -unlocked, true);
650
651 return unlocked;
652 }
653
654 static int vfio_iommu_type1_pin_pages(void *iommu_data,
655 struct iommu_group *iommu_group,
656 unsigned long *user_pfn,
657 int npage, int prot,
658 unsigned long *phys_pfn)
659 {
660 struct vfio_iommu *iommu = iommu_data;
661 struct vfio_group *group;
662 int i, j, ret;
663 unsigned long remote_vaddr;
664 struct vfio_dma *dma;
665 bool do_accounting;
666
667 if (!iommu || !user_pfn || !phys_pfn)
668 return -EINVAL;
669
670 /* Supported for v2 version only */
671 if (!iommu->v2)
672 return -EACCES;
673
674 mutex_lock(&iommu->lock);
675
676 /* Fail if notifier list is empty */
677 if (!iommu->notifier.head) {
678 ret = -EINVAL;
679 goto pin_done;
680 }
681
682 /*
683 * If an iommu capable domain exists in the container then all pages are
684 * already pinned and accounted. Accounting should be done only if there
685 * is no iommu capable domain in the container.
686 */
687 do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
688
689 for (i = 0; i < npage; i++) {
690 dma_addr_t iova;
691 struct vfio_pfn *vpfn;
692
693 iova = user_pfn[i] << PAGE_SHIFT;
694 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
695 if (!dma) {
696 ret = -EINVAL;
697 goto pin_unwind;
698 }
699
700 if ((dma->prot & prot) != prot) {
701 ret = -EPERM;
702 goto pin_unwind;
703 }
704
705 vpfn = vfio_iova_get_vfio_pfn(dma, iova);
706 if (vpfn) {
707 phys_pfn[i] = vpfn->pfn;
708 continue;
709 }
710
711 remote_vaddr = dma->vaddr + (iova - dma->iova);
712 ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
713 do_accounting);
714 if (ret)
715 goto pin_unwind;
716
717 ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
718 if (ret) {
719 if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
720 vfio_lock_acct(dma, -1, true);
721 goto pin_unwind;
722 }
723
724 if (iommu->dirty_page_tracking) {
725 unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
726
727 /*
728 * Bitmap populated with the smallest supported page
729 * size
730 */
731 bitmap_set(dma->bitmap,
732 (iova - dma->iova) >> pgshift, 1);
733 }
734 }
735 ret = i;
736
737 group = vfio_iommu_find_iommu_group(iommu, iommu_group);
738 if (!group->pinned_page_dirty_scope) {
739 group->pinned_page_dirty_scope = true;
740 update_pinned_page_dirty_scope(iommu);
741 }
742
743 goto pin_done;
744
745 pin_unwind:
746 phys_pfn[i] = 0;
747 for (j = 0; j < i; j++) {
748 dma_addr_t iova;
749
750 iova = user_pfn[j] << PAGE_SHIFT;
751 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
752 vfio_unpin_page_external(dma, iova, do_accounting);
753 phys_pfn[j] = 0;
754 }
755 pin_done:
756 mutex_unlock(&iommu->lock);
757 return ret;
758 }
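/*
 * External pinning sketch (the path behind vfio_pin_pages() used by mdev
 * vendor drivers): each user_pfn is an IOVA >> PAGE_SHIFT and must fall
 * inside an existing vfio_dma. Pins are refcounted per-IOVA via vfio_pfn
 * nodes, accounted only when no IOMMU-backed domain has already pinned the
 * whole range, recorded in the dirty bitmap when tracking is enabled, and
 * fully unwound on any failure.
 */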
759
760 static int vfio_iommu_type1_unpin_pages(void *iommu_data,
761 unsigned long *user_pfn,
762 int npage)
763 {
764 struct vfio_iommu *iommu = iommu_data;
765 bool do_accounting;
766 int i;
767
768 if (!iommu || !user_pfn)
769 return -EINVAL;
770
771 /* Supported for v2 version only */
772 if (!iommu->v2)
773 return -EACCES;
774
775 mutex_lock(&iommu->lock);
776
777 do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
778 for (i = 0; i < npage; i++) {
779 struct vfio_dma *dma;
780 dma_addr_t iova;
781
782 iova = user_pfn[i] << PAGE_SHIFT;
783 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
784 if (!dma)
785 goto unpin_exit;
786 vfio_unpin_page_external(dma, iova, do_accounting);
787 }
788
789 unpin_exit:
790 mutex_unlock(&iommu->lock);
791 return i > npage ? npage : (i > 0 ? i : -EINVAL);
792 }
793
794 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
795 struct list_head *regions,
796 struct iommu_iotlb_gather *iotlb_gather)
797 {
798 long unlocked = 0;
799 struct vfio_regions *entry, *next;
800
801 iommu_iotlb_sync(domain->domain, iotlb_gather);
802
803 list_for_each_entry_safe(entry, next, regions, list) {
804 unlocked += vfio_unpin_pages_remote(dma,
805 entry->iova,
806 entry->phys >> PAGE_SHIFT,
807 entry->len >> PAGE_SHIFT,
808 false);
809 list_del(&entry->list);
810 kfree(entry);
811 }
812
813 cond_resched();
814
815 return unlocked;
816 }
817
818 /*
819 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
820 * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
821 * track of these regions (currently using a list).
822 *
823 * This value specifies maximum number of regions for each IOTLB flush sync.
824 */
825 #define VFIO_IOMMU_TLB_SYNC_MAX 512
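/*
 * The tradeoff, roughly: a larger batch amortizes the (potentially expensive)
 * IOTLB invalidation over more iommu_unmap_fast() calls, while a smaller one
 * bounds how long pages stay unmapped-but-still-pinned and how much memory
 * the vfio_regions list can consume. 512 regions is an arbitrary middle
 * ground.
 */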
826
827 static size_t unmap_unpin_fast(struct vfio_domain *domain,
828 struct vfio_dma *dma, dma_addr_t *iova,
829 size_t len, phys_addr_t phys, long *unlocked,
830 struct list_head *unmapped_list,
831 int *unmapped_cnt,
832 struct iommu_iotlb_gather *iotlb_gather)
833 {
834 size_t unmapped = 0;
835 struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
836
837 if (entry) {
838 unmapped = iommu_unmap_fast(domain->domain, *iova, len,
839 iotlb_gather);
840
841 if (!unmapped) {
842 kfree(entry);
843 } else {
844 entry->iova = *iova;
845 entry->phys = phys;
846 entry->len = unmapped;
847 list_add_tail(&entry->list, unmapped_list);
848
849 *iova += unmapped;
850 (*unmapped_cnt)++;
851 }
852 }
853
854 /*
855 * Sync if the number of fast-unmap regions hits the limit
856 * or in case of errors.
857 */
858 if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
859 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
860 iotlb_gather);
861 *unmapped_cnt = 0;
862 }
863
864 return unmapped;
865 }
866
867 static size_t unmap_unpin_slow(struct vfio_domain *domain,
868 struct vfio_dma *dma, dma_addr_t *iova,
869 size_t len, phys_addr_t phys,
870 long *unlocked)
871 {
872 size_t unmapped = iommu_unmap(domain->domain, *iova, len);
873
874 if (unmapped) {
875 *unlocked += vfio_unpin_pages_remote(dma, *iova,
876 phys >> PAGE_SHIFT,
877 unmapped >> PAGE_SHIFT,
878 false);
879 *iova += unmapped;
880 cond_resched();
881 }
882 return unmapped;
883 }
884
885 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
886 bool do_accounting)
887 {
888 dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
889 struct vfio_domain *domain, *d;
890 LIST_HEAD(unmapped_region_list);
891 struct iommu_iotlb_gather iotlb_gather;
892 int unmapped_region_cnt = 0;
893 long unlocked = 0;
894
895 if (!dma->size)
896 return 0;
897
898 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
899 return 0;
900
901 /*
902 * We use the IOMMU to track the physical addresses, otherwise we'd
903 * need a much more complicated tracking system. Unfortunately that
904 * means we need to use one of the iommu domains to figure out the
905 * pfns to unpin. The rest need to be unmapped in advance so we have
906 * no iommu translations remaining when the pages are unpinned.
907 */
908 domain = d = list_first_entry(&iommu->domain_list,
909 struct vfio_domain, next);
910
911 list_for_each_entry_continue(d, &iommu->domain_list, next) {
912 iommu_unmap(d->domain, dma->iova, dma->size);
913 cond_resched();
914 }
915
916 iommu_iotlb_gather_init(&iotlb_gather);
917 while (iova < end) {
918 size_t unmapped, len;
919 phys_addr_t phys, next;
920
921 phys = iommu_iova_to_phys(domain->domain, iova);
922 if (WARN_ON(!phys)) {
923 iova += PAGE_SIZE;
924 continue;
925 }
926
927 /*
928 * To optimize for fewer iommu_unmap() calls, each of which
929 * may require hardware cache flushing, try to find the
930 * largest contiguous physical memory chunk to unmap.
931 */
932 for (len = PAGE_SIZE;
933 !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
934 next = iommu_iova_to_phys(domain->domain, iova + len);
935 if (next != phys + len)
936 break;
937 }
938
939 /*
940 * First, try to use fast unmap/unpin. In case of failure,
941 * switch to slow unmap/unpin path.
942 */
943 unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
944 &unlocked, &unmapped_region_list,
945 &unmapped_region_cnt,
946 &iotlb_gather);
947 if (!unmapped) {
948 unmapped = unmap_unpin_slow(domain, dma, &iova, len,
949 phys, &unlocked);
950 if (WARN_ON(!unmapped))
951 break;
952 }
953 }
954
955 dma->iommu_mapped = false;
956
957 if (unmapped_region_cnt) {
958 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
959 &iotlb_gather);
960 }
961
962 if (do_accounting) {
963 vfio_lock_acct(dma, -unlocked, true);
964 return 0;
965 }
966 return unlocked;
967 }
968
969 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
970 {
971 WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
972 vfio_unmap_unpin(iommu, dma, true);
973 vfio_unlink_dma(iommu, dma);
974 put_task_struct(dma->task);
975 vfio_dma_bitmap_free(dma);
976 kfree(dma);
977 iommu->dma_avail++;
978 }
979
980 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
981 {
982 struct vfio_domain *domain;
983
984 iommu->pgsize_bitmap = ULONG_MAX;
985
986 list_for_each_entry(domain, &iommu->domain_list, next)
987 iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
988
989 /*
990 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
991 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
992 * That way the user will be able to map/unmap buffers whose size/
993 * start address is aligned with PAGE_SIZE. Pinning code uses that
994 * granularity while iommu driver can use the sub-PAGE_SIZE size
995 * to map the buffer.
996 */
997 if (iommu->pgsize_bitmap & ~PAGE_MASK) {
998 iommu->pgsize_bitmap &= PAGE_MASK;
999 iommu->pgsize_bitmap |= PAGE_SIZE;
1000 }
1001 }
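/*
 * Worked example: with one domain supporting 4K|2M|1G and another supporting
 * only 4K|2M, the container advertises 4K|2M. If an IOMMU also reported a
 * sub-PAGE_SIZE granule (say 1K on a 4K PAGE_SIZE host), that bit is masked
 * off and PAGE_SIZE is advertised instead, since pinning always works in
 * PAGE_SIZE units even though the driver may map at finer granularity.
 */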
1002
1003 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1004 struct vfio_dma *dma, dma_addr_t base_iova,
1005 size_t pgsize)
1006 {
1007 unsigned long pgshift = __ffs(pgsize);
1008 unsigned long nbits = dma->size >> pgshift;
1009 unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
1010 unsigned long copy_offset = bit_offset / BITS_PER_LONG;
1011 unsigned long shift = bit_offset % BITS_PER_LONG;
1012 unsigned long leftover;
1013
1014 /*
1015 * mark all pages dirty if any IOMMU capable device is not able
1016 * to report dirty pages and all pages are pinned and mapped.
1017 */
1018 if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped)
1019 bitmap_set(dma->bitmap, 0, nbits);
1020
1021 if (shift) {
1022 bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
1023 nbits + shift);
1024
1025 if (copy_from_user(&leftover,
1026 (void __user *)(bitmap + copy_offset),
1027 sizeof(leftover)))
1028 return -EFAULT;
1029
1030 bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1031 }
1032
1033 if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1034 DIRTY_BITMAP_BYTES(nbits + shift)))
1035 return -EFAULT;
1036
1037 return 0;
1038 }
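/*
 * Example of the unaligned merge above, assuming 4K pages and a 64-bit long:
 * with base_iova 0 and dma->iova = 0x11000, bit_offset is 17, copy_offset is
 * 0 and shift is 17. dma->bitmap is shifted left by 17 so its bit 0 lands at
 * user bit 17, and the first user u64 is read back and OR-ed in so any bits
 * 0-16 already written for a neighboring vfio_dma aren't clobbered.
 */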
1039
1040 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1041 dma_addr_t iova, size_t size, size_t pgsize)
1042 {
1043 struct vfio_dma *dma;
1044 struct rb_node *n;
1045 unsigned long pgshift = __ffs(pgsize);
1046 int ret;
1047
1048 /*
1049 * GET_BITMAP request must fully cover vfio_dma mappings. Multiple
1050 * vfio_dma mappings may be clubbed by specifying large ranges, but
1051 * there must not be any previous mappings bisected by the range.
1052 * An error will be returned if these conditions are not met.
1053 */
1054 dma = vfio_find_dma(iommu, iova, 1);
1055 if (dma && dma->iova != iova)
1056 return -EINVAL;
1057
1058 dma = vfio_find_dma(iommu, iova + size - 1, 0);
1059 if (dma && dma->iova + dma->size != iova + size)
1060 return -EINVAL;
1061
1062 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1063 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1064
1065 if (dma->iova < iova)
1066 continue;
1067
1068 if (dma->iova > iova + size - 1)
1069 break;
1070
1071 ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1072 if (ret)
1073 return ret;
1074
1075 /*
1076 * Re-populate bitmap to include all pinned pages which are
1077 * considered as dirty but exclude pages which are unpinned and
1078 * pages which are marked dirty by vfio_dma_rw()
1079 */
1080 bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1081 vfio_dma_populate_bitmap(dma, pgsize);
1082 }
1083 return 0;
1084 }
1085
1086 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1087 {
1088 if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1089 (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1090 return -EINVAL;
1091
1092 return 0;
1093 }
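/*
 * E.g. reporting a 1G range at 4K granularity covers 262144 pages, so the
 * user must supply at least DIRTY_BITMAP_BYTES(262144) = 32K of bitmap and
 * never more than DIRTY_BITMAP_SIZE_MAX (256 MB).
 */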
1094
1095 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1096 struct vfio_iommu_type1_dma_unmap *unmap,
1097 struct vfio_bitmap *bitmap)
1098 {
1099 struct vfio_dma *dma, *dma_last = NULL;
1100 size_t unmapped = 0, pgsize;
1101 int ret = 0, retries = 0;
1102 unsigned long pgshift;
1103
1104 mutex_lock(&iommu->lock);
1105
1106 pgshift = __ffs(iommu->pgsize_bitmap);
1107 pgsize = (size_t)1 << pgshift;
1108
1109 if (unmap->iova & (pgsize - 1)) {
1110 ret = -EINVAL;
1111 goto unlock;
1112 }
1113
1114 if (!unmap->size || unmap->size & (pgsize - 1)) {
1115 ret = -EINVAL;
1116 goto unlock;
1117 }
1118
1119 if (unmap->iova + unmap->size - 1 < unmap->iova ||
1120 unmap->size > SIZE_MAX) {
1121 ret = -EINVAL;
1122 goto unlock;
1123 }
1124
1125 /* When dirty tracking is enabled, allow only min supported pgsize */
1126 if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1127 (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1128 ret = -EINVAL;
1129 goto unlock;
1130 }
1131
1132 WARN_ON((pgsize - 1) & PAGE_MASK);
1133 again:
1134 /*
1135 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
1136 * avoid tracking individual mappings. This means that the granularity
1137 * of the original mapping was lost and the user was allowed to attempt
1138 * to unmap any range. Depending on the contiguousness of physical
1139 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
1140 * or may not have worked. We only guaranteed unmap granularity
1141 * matching the original mapping; even though it was untracked here,
1142 * the original mappings are reflected in IOMMU mappings. This
1143 * resulted in a couple unusual behaviors. First, if a range is not
1144 * able to be unmapped, ex. a set of 4k pages that was mapped as a
1145 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1146 * a zero sized unmap. Also, if an unmap request overlaps the first
1147 * address of a hugepage, the IOMMU will unmap the entire hugepage.
1148 * This also returns success and the returned unmap size reflects the
1149 * actual size unmapped.
1150 *
1151 * We attempt to maintain compatibility with this "v1" interface, but
1152 * we take control out of the hands of the IOMMU. Therefore, an unmap
1153 * request offset from the beginning of the original mapping will
1154 * return success with zero sized unmap. And an unmap request covering
1155 * the first iova of mapping will unmap the entire range.
1156 *
1157 * The v2 version of this interface intends to be more deterministic.
1158 * Unmap requests must fully cover previous mappings. Multiple
1159 * mappings may still be unmapped by specifying large ranges, but there
1160 * must not be any previous mappings bisected by the range. An error
1161 * will be returned if these conditions are not met. The v2 interface
1162 * will only return success and a size of zero if there were no
1163 * mappings within the range.
1164 */
1165 if (iommu->v2) {
1166 dma = vfio_find_dma(iommu, unmap->iova, 1);
1167 if (dma && dma->iova != unmap->iova) {
1168 ret = -EINVAL;
1169 goto unlock;
1170 }
1171 dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
1172 if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
1173 ret = -EINVAL;
1174 goto unlock;
1175 }
1176 }
1177
1178 while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
1179 if (!iommu->v2 && unmap->iova > dma->iova)
1180 break;
1181 /*
1182 * Task with same address space who mapped this iova range is
1183 * allowed to unmap the iova range.
1184 */
1185 if (dma->task->mm != current->mm)
1186 break;
1187
1188 if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1189 struct vfio_iommu_type1_dma_unmap nb_unmap;
1190
1191 if (dma_last == dma) {
1192 BUG_ON(++retries > 10);
1193 } else {
1194 dma_last = dma;
1195 retries = 0;
1196 }
1197
1198 nb_unmap.iova = dma->iova;
1199 nb_unmap.size = dma->size;
1200
1201 /*
1202 * Notify anyone (mdev vendor drivers) to invalidate and
1203 * unmap iovas within the range we're about to unmap.
1204 * Vendor drivers MUST unpin pages in response to an
1205 * invalidation.
1206 */
1207 mutex_unlock(&iommu->lock);
1208 blocking_notifier_call_chain(&iommu->notifier,
1209 VFIO_IOMMU_NOTIFY_DMA_UNMAP,
1210 &nb_unmap);
1211 mutex_lock(&iommu->lock);
1212 goto again;
1213 }
1214
1215 if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1216 ret = update_user_bitmap(bitmap->data, iommu, dma,
1217 unmap->iova, pgsize);
1218 if (ret)
1219 break;
1220 }
1221
1222 unmapped += dma->size;
1223 vfio_remove_dma(iommu, dma);
1224 }
1225
1226 unlock:
1227 mutex_unlock(&iommu->lock);
1228
1229 /* Report how much was unmapped */
1230 unmap->size = unmapped;
1231
1232 return ret;
1233 }
1234
1235 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1236 unsigned long pfn, long npage, int prot)
1237 {
1238 struct vfio_domain *d;
1239 int ret;
1240
1241 list_for_each_entry(d, &iommu->domain_list, next) {
1242 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1243 npage << PAGE_SHIFT, prot | d->prot);
1244 if (ret)
1245 goto unwind;
1246
1247 cond_resched();
1248 }
1249
1250 return 0;
1251
1252 unwind:
1253 list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1254 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1255 cond_resched();
1256 }
1257
1258 return ret;
1259 }
1260
1261 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1262 size_t map_size)
1263 {
1264 dma_addr_t iova = dma->iova;
1265 unsigned long vaddr = dma->vaddr;
1266 size_t size = map_size;
1267 long npage;
1268 unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1269 int ret = 0;
1270
1271 while (size) {
1272 /* Pin a contiguous chunk of memory */
1273 npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1274 size >> PAGE_SHIFT, &pfn, limit);
1275 if (npage <= 0) {
1276 WARN_ON(!npage);
1277 ret = (int)npage;
1278 break;
1279 }
1280
1281 /* Map it! */
1282 ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1283 dma->prot);
1284 if (ret) {
1285 vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1286 npage, true);
1287 break;
1288 }
1289
1290 size -= npage << PAGE_SHIFT;
1291 dma->size += npage << PAGE_SHIFT;
1292 }
1293
1294 dma->iommu_mapped = true;
1295
1296 if (ret)
1297 vfio_remove_dma(iommu, dma);
1298
1299 return ret;
1300 }
1301
1302 /*
1303 * Check dma map request is within a valid iova range
1304 */
1305 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1306 dma_addr_t start, dma_addr_t end)
1307 {
1308 struct list_head *iova = &iommu->iova_list;
1309 struct vfio_iova *node;
1310
1311 list_for_each_entry(node, iova, list) {
1312 if (start >= node->start && end <= node->end)
1313 return true;
1314 }
1315
1316 /*
1317 * Check for list_empty() as well since a container with
1318 * a single mdev device will have an empty list.
1319 */
1320 return list_empty(iova);
1321 }
1322
1323 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1324 struct vfio_iommu_type1_dma_map *map)
1325 {
1326 dma_addr_t iova = map->iova;
1327 unsigned long vaddr = map->vaddr;
1328 size_t size = map->size;
1329 int ret = 0, prot = 0;
1330 size_t pgsize;
1331 struct vfio_dma *dma;
1332
1333 /* Verify that none of our __u64 fields overflow */
1334 if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1335 return -EINVAL;
1336
1337 /* READ/WRITE from device perspective */
1338 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1339 prot |= IOMMU_WRITE;
1340 if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1341 prot |= IOMMU_READ;
1342
1343 mutex_lock(&iommu->lock);
1344
1345 pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1346
1347 WARN_ON((pgsize - 1) & PAGE_MASK);
1348
1349 if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) {
1350 ret = -EINVAL;
1351 goto out_unlock;
1352 }
1353
1354 /* Don't allow IOVA or virtual address wrap */
1355 if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1356 ret = -EINVAL;
1357 goto out_unlock;
1358 }
1359
1360 if (vfio_find_dma(iommu, iova, size)) {
1361 ret = -EEXIST;
1362 goto out_unlock;
1363 }
1364
1365 if (!iommu->dma_avail) {
1366 ret = -ENOSPC;
1367 goto out_unlock;
1368 }
1369
1370 if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1371 ret = -EINVAL;
1372 goto out_unlock;
1373 }
1374
1375 dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1376 if (!dma) {
1377 ret = -ENOMEM;
1378 goto out_unlock;
1379 }
1380
1381 iommu->dma_avail--;
1382 dma->iova = iova;
1383 dma->vaddr = vaddr;
1384 dma->prot = prot;
1385
1386 /*
1387 * We need to be able to both add to a task's locked memory and test
1388 * against the locked memory limit and we need to be able to do both
1389 * outside of this call path as pinning can be asynchronous via the
1390 * external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
1391 * task_struct and VM locked pages requires an mm_struct, however
1392 * holding an indefinite mm reference is not recommended, therefore we
1393 * only hold a reference to a task. We could hold a reference to
1394 * current, however QEMU uses this call path through vCPU threads,
1395 * which can be killed resulting in a NULL mm and failure in the unmap
1396 * path when called via a different thread. Avoid this problem by
1397 * using the group_leader as threads within the same group require
1398 * both CLONE_THREAD and CLONE_VM and will therefore use the same
1399 * mm_struct.
1400 *
1401 * Previously we also used the task for testing CAP_IPC_LOCK at the
1402 * time of pinning and accounting, however has_capability() makes use
1403 * of real_cred, a copy-on-write field, so we can't guarantee that it
1404 * matches group_leader, or in fact that it might not change by the
1405 * time it's evaluated. If a process were to call MAP_DMA with
1406 * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
1407 * possibly see different results for an iommu_mapped vfio_dma vs
1408 * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the
1409 * time of calling MAP_DMA.
1410 */
1411 get_task_struct(current->group_leader);
1412 dma->task = current->group_leader;
1413 dma->lock_cap = capable(CAP_IPC_LOCK);
1414
1415 dma->pfn_list = RB_ROOT;
1416
1417 /* Insert zero-sized and grow as we map chunks of it */
1418 vfio_link_dma(iommu, dma);
1419
1420 /* Don't pin and map if container doesn't contain IOMMU capable domain */
1421 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1422 dma->size = size;
1423 else
1424 ret = vfio_pin_map_dma(iommu, dma, size);
1425
1426 if (!ret && iommu->dirty_page_tracking) {
1427 ret = vfio_dma_bitmap_alloc(dma, pgsize);
1428 if (ret)
1429 vfio_remove_dma(iommu, dma);
1430 }
1431
1432 out_unlock:
1433 mutex_unlock(&iommu->lock);
1434 return ret;
1435 }
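/*
 * For reference, a typical userspace sequence that ends up here looks
 * roughly like this (illustrative sketch, error handling omitted):
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)buffer,	// page-aligned process VA
 *		.iova  = 0x100000,	// device view of the buffer
 *		.size  = buffer_size,	// multiple of the IOMMU page size
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 */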
1436
1437 static int vfio_bus_type(struct device *dev, void *data)
1438 {
1439 struct bus_type **bus = data;
1440
1441 if (*bus && *bus != dev->bus)
1442 return -EINVAL;
1443
1444 *bus = dev->bus;
1445
1446 return 0;
1447 }
1448
1449 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1450 struct vfio_domain *domain)
1451 {
1452 struct vfio_domain *d = NULL;
1453 struct rb_node *n;
1454 unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1455 int ret;
1456
1457 /* Arbitrarily pick the first domain in the list for lookups */
1458 if (!list_empty(&iommu->domain_list))
1459 d = list_first_entry(&iommu->domain_list,
1460 struct vfio_domain, next);
1461
1462 n = rb_first(&iommu->dma_list);
1463
1464 for (; n; n = rb_next(n)) {
1465 struct vfio_dma *dma;
1466 dma_addr_t iova;
1467
1468 dma = rb_entry(n, struct vfio_dma, node);
1469 iova = dma->iova;
1470
1471 while (iova < dma->iova + dma->size) {
1472 phys_addr_t phys;
1473 size_t size;
1474
1475 if (dma->iommu_mapped) {
1476 phys_addr_t p;
1477 dma_addr_t i;
1478
1479 if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1480 ret = -EINVAL;
1481 goto unwind;
1482 }
1483
1484 phys = iommu_iova_to_phys(d->domain, iova);
1485
1486 if (WARN_ON(!phys)) {
1487 iova += PAGE_SIZE;
1488 continue;
1489 }
1490
1491 size = PAGE_SIZE;
1492 p = phys + size;
1493 i = iova + size;
1494 while (i < dma->iova + dma->size &&
1495 p == iommu_iova_to_phys(d->domain, i)) {
1496 size += PAGE_SIZE;
1497 p += PAGE_SIZE;
1498 i += PAGE_SIZE;
1499 }
1500 } else {
1501 unsigned long pfn;
1502 unsigned long vaddr = dma->vaddr +
1503 (iova - dma->iova);
1504 size_t n = dma->iova + dma->size - iova;
1505 long npage;
1506
1507 npage = vfio_pin_pages_remote(dma, vaddr,
1508 n >> PAGE_SHIFT,
1509 &pfn, limit);
1510 if (npage <= 0) {
1511 WARN_ON(!npage);
1512 ret = (int)npage;
1513 goto unwind;
1514 }
1515
1516 phys = pfn << PAGE_SHIFT;
1517 size = npage << PAGE_SHIFT;
1518 }
1519
1520 ret = iommu_map(domain->domain, iova, phys,
1521 size, dma->prot | domain->prot);
1522 if (ret) {
1523 if (!dma->iommu_mapped)
1524 vfio_unpin_pages_remote(dma, iova,
1525 phys >> PAGE_SHIFT,
1526 size >> PAGE_SHIFT,
1527 true);
1528 goto unwind;
1529 }
1530
1531 iova += size;
1532 }
1533 }
1534
1535 /* All dmas are now mapped, defer to second tree walk for unwind */
1536 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1537 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1538
1539 dma->iommu_mapped = true;
1540 }
1541
1542 return 0;
1543
1544 unwind:
1545 for (; n; n = rb_prev(n)) {
1546 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1547 dma_addr_t iova;
1548
1549 if (dma->iommu_mapped) {
1550 iommu_unmap(domain->domain, dma->iova, dma->size);
1551 continue;
1552 }
1553
1554 iova = dma->iova;
1555 while (iova < dma->iova + dma->size) {
1556 phys_addr_t phys, p;
1557 size_t size;
1558 dma_addr_t i;
1559
1560 phys = iommu_iova_to_phys(domain->domain, iova);
1561 if (!phys) {
1562 iova += PAGE_SIZE;
1563 continue;
1564 }
1565
1566 size = PAGE_SIZE;
1567 p = phys + size;
1568 i = iova + size;
1569 while (i < dma->iova + dma->size &&
1570 p == iommu_iova_to_phys(domain->domain, i)) {
1571 size += PAGE_SIZE;
1572 p += PAGE_SIZE;
1573 i += PAGE_SIZE;
1574 }
1575
1576 iommu_unmap(domain->domain, iova, size);
1577 vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1578 size >> PAGE_SHIFT, true);
1579 }
1580 }
1581
1582 return ret;
1583 }
1584
1585 /*
1586 * We change our unmap behavior slightly depending on whether the IOMMU
1587 * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage
1588 * for practically any contiguous power-of-two mapping we give it. This means
1589 * we don't need to look for contiguous chunks ourselves to make unmapping
1590 * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d
1591 * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1592 * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1593 * hugetlbfs is in use.
1594 */
1595 static void vfio_test_domain_fgsp(struct vfio_domain *domain)
1596 {
1597 struct page *pages;
1598 int ret, order = get_order(PAGE_SIZE * 2);
1599
1600 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1601 if (!pages)
1602 return;
1603
1604 ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
1605 IOMMU_READ | IOMMU_WRITE | domain->prot);
1606 if (!ret) {
1607 size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
1608
1609 if (unmapped == PAGE_SIZE)
1610 iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
1611 else
1612 domain->fgsp = true;
1613 }
1614
1615 __free_pages(pages, order);
1616 }
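/*
 * Probe logic, in short: map two physically contiguous pages at IOVA 0 and
 * ask to unmap only the first. An IOMMU that silently promoted the pair to a
 * superpage tears down more than PAGE_SIZE, which we take as "fine-grained
 * superpages" (fgsp): it builds superpages on its own, so vfio_unmap_unpin()
 * need not search for contiguous chunks itself.
 */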
1617
1618 static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
1619 struct iommu_group *iommu_group)
1620 {
1621 struct vfio_group *g;
1622
1623 list_for_each_entry(g, &domain->group_list, next) {
1624 if (g->iommu_group == iommu_group)
1625 return g;
1626 }
1627
1628 return NULL;
1629 }
1630
1631 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1632 struct iommu_group *iommu_group)
1633 {
1634 struct vfio_domain *domain;
1635 struct vfio_group *group = NULL;
1636
1637 list_for_each_entry(domain, &iommu->domain_list, next) {
1638 group = find_iommu_group(domain, iommu_group);
1639 if (group)
1640 return group;
1641 }
1642
1643 if (iommu->external_domain)
1644 group = find_iommu_group(iommu->external_domain, iommu_group);
1645
1646 return group;
1647 }
1648
1649 static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
1650 {
1651 struct vfio_domain *domain;
1652 struct vfio_group *group;
1653
1654 list_for_each_entry(domain, &iommu->domain_list, next) {
1655 list_for_each_entry(group, &domain->group_list, next) {
1656 if (!group->pinned_page_dirty_scope) {
1657 iommu->pinned_page_dirty_scope = false;
1658 return;
1659 }
1660 }
1661 }
1662
1663 if (iommu->external_domain) {
1664 domain = iommu->external_domain;
1665 list_for_each_entry(group, &domain->group_list, next) {
1666 if (!group->pinned_page_dirty_scope) {
1667 iommu->pinned_page_dirty_scope = false;
1668 return;
1669 }
1670 }
1671 }
1672
1673 iommu->pinned_page_dirty_scope = true;
1674 }
1675
1676 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1677 phys_addr_t *base)
1678 {
1679 struct iommu_resv_region *region;
1680 bool ret = false;
1681
1682 list_for_each_entry(region, group_resv_regions, list) {
1683 /*
1684 * The presence of any 'real' MSI regions should take
1685 * precedence over the software-managed one if the
1686 * IOMMU driver happens to advertise both types.
1687 */
1688 if (region->type == IOMMU_RESV_MSI) {
1689 ret = false;
1690 break;
1691 }
1692
1693 if (region->type == IOMMU_RESV_SW_MSI) {
1694 *base = region->start;
1695 ret = true;
1696 }
1697 }
1698
1699 return ret;
1700 }
1701
1702 static struct device *vfio_mdev_get_iommu_device(struct device *dev)
1703 {
1704 struct device *(*fn)(struct device *dev);
1705 struct device *iommu_device;
1706
1707 fn = symbol_get(mdev_get_iommu_device);
1708 if (fn) {
1709 iommu_device = fn(dev);
1710 symbol_put(mdev_get_iommu_device);
1711
1712 return iommu_device;
1713 }
1714
1715 return NULL;
1716 }
1717
1718 static int vfio_mdev_attach_domain(struct device *dev, void *data)
1719 {
1720 struct iommu_domain *domain = data;
1721 struct device *iommu_device;
1722
1723 iommu_device = vfio_mdev_get_iommu_device(dev);
1724 if (iommu_device) {
1725 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1726 return iommu_aux_attach_device(domain, iommu_device);
1727 else
1728 return iommu_attach_device(domain, iommu_device);
1729 }
1730
1731 return -EINVAL;
1732 }
1733
1734 static int vfio_mdev_detach_domain(struct device *dev, void *data)
1735 {
1736 struct iommu_domain *domain = data;
1737 struct device *iommu_device;
1738
1739 iommu_device = vfio_mdev_get_iommu_device(dev);
1740 if (iommu_device) {
1741 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1742 iommu_aux_detach_device(domain, iommu_device);
1743 else
1744 iommu_detach_device(domain, iommu_device);
1745 }
1746
1747 return 0;
1748 }
1749
1750 static int vfio_iommu_attach_group(struct vfio_domain *domain,
1751 struct vfio_group *group)
1752 {
1753 if (group->mdev_group)
1754 return iommu_group_for_each_dev(group->iommu_group,
1755 domain->domain,
1756 vfio_mdev_attach_domain);
1757 else
1758 return iommu_attach_group(domain->domain, group->iommu_group);
1759 }
1760
1761 static void vfio_iommu_detach_group(struct vfio_domain *domain,
1762 struct vfio_group *group)
1763 {
1764 if (group->mdev_group)
1765 iommu_group_for_each_dev(group->iommu_group, domain->domain,
1766 vfio_mdev_detach_domain);
1767 else
1768 iommu_detach_group(domain->domain, group->iommu_group);
1769 }
1770
1771 static bool vfio_bus_is_mdev(struct bus_type *bus)
1772 {
1773 struct bus_type *mdev_bus;
1774 bool ret = false;
1775
1776 mdev_bus = symbol_get(mdev_bus_type);
1777 if (mdev_bus) {
1778 ret = (bus == mdev_bus);
1779 symbol_put(mdev_bus_type);
1780 }
1781
1782 return ret;
1783 }
1784
1785 static int vfio_mdev_iommu_device(struct device *dev, void *data)
1786 {
1787 struct device **old = data, *new;
1788
1789 new = vfio_mdev_get_iommu_device(dev);
1790 if (!new || (*old && *old != new))
1791 return -EINVAL;
1792
1793 *old = new;
1794
1795 return 0;
1796 }
1797
1798 /*
1799 * This is a helper function to insert an address range to iova list.
1800 * The list is initially created with a single entry corresponding to
1801 * the IOMMU domain geometry to which the device group is attached.
1802 * The list aperture gets modified when a new domain is added to the
1803 * container if the new aperture doesn't conflict with the current one
1804 * or with any existing dma mappings. The list is also modified to
1805 * exclude any reserved regions associated with the device group.
1806 */
1807 static int vfio_iommu_iova_insert(struct list_head *head,
1808 dma_addr_t start, dma_addr_t end)
1809 {
1810 struct vfio_iova *region;
1811
1812 region = kmalloc(sizeof(*region), GFP_KERNEL);
1813 if (!region)
1814 return -ENOMEM;
1815
1816 INIT_LIST_HEAD(&region->list);
1817 region->start = start;
1818 region->end = end;
1819
1820 list_add_tail(&region->list, head);
1821 return 0;
1822 }
1823
1824 /*
1825 * Check whether the new iommu aperture conflicts with the existing aperture
1826 * or with any existing dma mappings.
1827 */
1828 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1829 dma_addr_t start, dma_addr_t end)
1830 {
1831 struct vfio_iova *first, *last;
1832 struct list_head *iova = &iommu->iova_list;
1833
1834 if (list_empty(iova))
1835 return false;
1836
1837 /* Disjoint sets, return conflict */
1838 first = list_first_entry(iova, struct vfio_iova, list);
1839 last = list_last_entry(iova, struct vfio_iova, list);
1840 if (start > last->end || end < first->start)
1841 return true;
1842
1843 /* Check for any existing dma mappings below the new start */
1844 if (start > first->start) {
1845 if (vfio_find_dma(iommu, first->start, start - first->start))
1846 return true;
1847 }
1848
1849 /* Check for any existing dma mappings beyond the new end */
1850 if (end < last->end) {
1851 if (vfio_find_dma(iommu, end + 1, last->end - end))
1852 return true;
1853 }
1854
1855 return false;
1856 }
1857
1858 /*
1859 * Resize iommu iova aperture window. This is called only if the new
1860 * aperture has no conflict with existing aperture and dma mappings.
1861 */
1862 static int vfio_iommu_aper_resize(struct list_head *iova,
1863 dma_addr_t start, dma_addr_t end)
1864 {
1865 struct vfio_iova *node, *next;
1866
1867 if (list_empty(iova))
1868 return vfio_iommu_iova_insert(iova, start, end);
1869
1870 /* Adjust iova list start */
1871 list_for_each_entry_safe(node, next, iova, list) {
1872 if (start < node->start)
1873 break;
1874 if (start >= node->start && start < node->end) {
1875 node->start = start;
1876 break;
1877 }
1878 /* Delete nodes before new start */
1879 list_del(&node->list);
1880 kfree(node);
1881 }
1882
1883 /* Adjust iova list end */
1884 list_for_each_entry_safe(node, next, iova, list) {
1885 if (end > node->end)
1886 continue;
1887 if (end > node->start && end <= node->end) {
1888 node->end = end;
1889 continue;
1890 }
1891 /* Delete nodes after new end */
1892 list_del(&node->list);
1893 kfree(node);
1894 }
1895
1896 return 0;
1897 }
1898
1899 /*
1900 * Check whether any reserved regions conflict with existing dma mappings
1901 */
1902 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
1903 struct list_head *resv_regions)
1904 {
1905 struct iommu_resv_region *region;
1906
1907 /* Check for conflict with existing dma mappings */
1908 list_for_each_entry(region, resv_regions, list) {
1909 if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
1910 continue;
1911
1912 if (vfio_find_dma(iommu, region->start, region->length))
1913 return true;
1914 }
1915
1916 return false;
1917 }
1918
1919 /*
1920 * Check the iova regions for overlap with reserved regions and
1921 * exclude the overlapping portions from the iommu iova range
1922 */
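/*
 * For example (illustrative values only), excluding the common x86 MSI
 * reserved range [0xfee00000, 0xfeefffff] from a single iova node
 * [0x0, 0xffffffff] leaves two nodes behind:
 * [0x0, 0xfedfffff] and [0xfef00000, 0xffffffff].
 */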
1923 static int vfio_iommu_resv_exclude(struct list_head *iova,
1924 struct list_head *resv_regions)
1925 {
1926 struct iommu_resv_region *resv;
1927 struct vfio_iova *n, *next;
1928
1929 list_for_each_entry(resv, resv_regions, list) {
1930 phys_addr_t start, end;
1931
1932 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1933 continue;
1934
1935 start = resv->start;
1936 end = resv->start + resv->length - 1;
1937
1938 list_for_each_entry_safe(n, next, iova, list) {
1939 int ret = 0;
1940
1941 /* No overlap */
1942 if (start > n->end || end < n->start)
1943 continue;
1944 /*
1945 * Insert a new node if the current node overlaps with the
1946 * reserved region, to exclude that range from the valid iova
1947 * range. Note that the new node is inserted before the
1948 * current node and finally the current node is deleted,
1949 * keeping the list updated and sorted.
1950 */
1951 if (start > n->start)
1952 ret = vfio_iommu_iova_insert(&n->list, n->start,
1953 start - 1);
1954 if (!ret && end < n->end)
1955 ret = vfio_iommu_iova_insert(&n->list, end + 1,
1956 n->end);
1957 if (ret)
1958 return ret;
1959
1960 list_del(&n->list);
1961 kfree(n);
1962 }
1963 }
1964
1965 if (list_empty(iova))
1966 return -EINVAL;
1967
1968 return 0;
1969 }
1970
1971 static void vfio_iommu_resv_free(struct list_head *resv_regions)
1972 {
1973 struct iommu_resv_region *n, *next;
1974
1975 list_for_each_entry_safe(n, next, resv_regions, list) {
1976 list_del(&n->list);
1977 kfree(n);
1978 }
1979 }
1980
1981 static void vfio_iommu_iova_free(struct list_head *iova)
1982 {
1983 struct vfio_iova *n, *next;
1984
1985 list_for_each_entry_safe(n, next, iova, list) {
1986 list_del(&n->list);
1987 kfree(n);
1988 }
1989 }
1990
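/* Duplicate the current iova list into @iova_copy, freeing the copy on error. */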
1991 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
1992 struct list_head *iova_copy)
1993 {
1994 struct list_head *iova = &iommu->iova_list;
1995 struct vfio_iova *n;
1996 int ret;
1997
1998 list_for_each_entry(n, iova, list) {
1999 ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2000 if (ret)
2001 goto out_free;
2002 }
2003
2004 return 0;
2005
2006 out_free:
2007 vfio_iommu_iova_free(iova_copy);
2008 return ret;
2009 }
2010
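/* Replace the iommu's iova list with @iova_copy, freeing the old list. */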
2011 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2012 struct list_head *iova_copy)
2013 {
2014 struct list_head *iova = &iommu->iova_list;
2015
2016 vfio_iommu_iova_free(iova);
2017
2018 list_splice_tail(iova_copy, iova);
2019 }
2020
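/*
 * ->attach_group() callback. An mdev group without an IOMMU-backing
 * device is parked in the external domain and its pages are tracked
 * only through the pin/unpin interface. For an IOMMU-backed group a
 * new iommu_domain is allocated (or an existing compatible domain is
 * reused), the valid iova list is recomputed against the domain
 * geometry and the group's reserved regions, and existing mappings
 * are replayed into the new domain.
 */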
2021 static int vfio_iommu_type1_attach_group(void *iommu_data,
2022 struct iommu_group *iommu_group)
2023 {
2024 struct vfio_iommu *iommu = iommu_data;
2025 struct vfio_group *group;
2026 struct vfio_domain *domain, *d;
2027 struct bus_type *bus = NULL;
2028 int ret;
2029 bool resv_msi, msi_remap;
2030 phys_addr_t resv_msi_base = 0;
2031 struct iommu_domain_geometry geo;
2032 LIST_HEAD(iova_copy);
2033 LIST_HEAD(group_resv_regions);
2034
2035 mutex_lock(&iommu->lock);
2036
2037 /* Check for duplicates */
2038 if (vfio_iommu_find_iommu_group(iommu, iommu_group)) {
2039 mutex_unlock(&iommu->lock);
2040 return -EINVAL;
2041 }
2042
2043 group = kzalloc(sizeof(*group), GFP_KERNEL);
2044 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2045 if (!group || !domain) {
2046 ret = -ENOMEM;
2047 goto out_free;
2048 }
2049
2050 group->iommu_group = iommu_group;
2051
2052 /* Determine bus_type in order to allocate a domain */
2053 ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
2054 if (ret)
2055 goto out_free;
2056
2057 if (vfio_bus_is_mdev(bus)) {
2058 struct device *iommu_device = NULL;
2059
2060 group->mdev_group = true;
2061
2062 /* Determine the isolation type */
2063 ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
2064 vfio_mdev_iommu_device);
2065 if (ret || !iommu_device) {
2066 if (!iommu->external_domain) {
2067 INIT_LIST_HEAD(&domain->group_list);
2068 iommu->external_domain = domain;
2069 vfio_update_pgsize_bitmap(iommu);
2070 } else {
2071 kfree(domain);
2072 }
2073
2074 list_add(&group->next,
2075 &iommu->external_domain->group_list);
2076 /*
2077 * Non-iommu backed group cannot dirty memory directly,
2078 * it can only use interfaces that provide dirty
2079 * tracking.
2080 * The iommu scope can only be promoted with the
2081 * addition of a dirty tracking group.
2082 */
2083 group->pinned_page_dirty_scope = true;
2084 if (!iommu->pinned_page_dirty_scope)
2085 update_pinned_page_dirty_scope(iommu);
2086 mutex_unlock(&iommu->lock);
2087
2088 return 0;
2089 }
2090
2091 bus = iommu_device->bus;
2092 }
2093
2094 domain->domain = iommu_domain_alloc(bus);
2095 if (!domain->domain) {
2096 ret = -EIO;
2097 goto out_free;
2098 }
2099
2100 if (iommu->nesting) {
2101 int attr = 1;
2102
2103 ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
2104 &attr);
2105 if (ret)
2106 goto out_domain;
2107 }
2108
2109 ret = vfio_iommu_attach_group(domain, group);
2110 if (ret)
2111 goto out_domain;
2112
2113 /* Get aperture info */
2114 iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
2115
2116 if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
2117 geo.aperture_end)) {
2118 ret = -EINVAL;
2119 goto out_detach;
2120 }
2121
2122 ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2123 if (ret)
2124 goto out_detach;
2125
2126 if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2127 ret = -EINVAL;
2128 goto out_detach;
2129 }
2130
2131 /*
2132 * We don't want to work on the original iova list as it gets
2133 * modified here, and in case of failure we have to retain the
2134 * original list. Get a copy here.
2135 */
2136 ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2137 if (ret)
2138 goto out_detach;
2139
2140 ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
2141 geo.aperture_end);
2142 if (ret)
2143 goto out_detach;
2144
2145 ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2146 if (ret)
2147 goto out_detach;
2148
2149 resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2150
2151 INIT_LIST_HEAD(&domain->group_list);
2152 list_add(&group->next, &domain->group_list);
2153
2154 msi_remap = irq_domain_check_msi_remap() ||
2155 iommu_capable(bus, IOMMU_CAP_INTR_REMAP);
2156
2157 if (!allow_unsafe_interrupts && !msi_remap) {
2158 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2159 __func__);
2160 ret = -EPERM;
2161 goto out_detach;
2162 }
2163
2164 if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
2165 domain->prot |= IOMMU_CACHE;
2166
2167 /*
2168 * Try to match an existing compatible domain. We don't want to
2169 * preclude an IOMMU driver supporting multiple bus_types and being
2170 * able to include different bus_types in the same IOMMU domain, so
2171 * we test whether the domains use the same iommu_ops rather than
2172 * testing if they're on the same bus_type.
2173 */
2174 list_for_each_entry(d, &iommu->domain_list, next) {
2175 if (d->domain->ops == domain->domain->ops &&
2176 d->prot == domain->prot) {
2177 vfio_iommu_detach_group(domain, group);
2178 if (!vfio_iommu_attach_group(d, group)) {
2179 list_add(&group->next, &d->group_list);
2180 iommu_domain_free(domain->domain);
2181 kfree(domain);
2182 goto done;
2183 }
2184
2185 ret = vfio_iommu_attach_group(domain, group);
2186 if (ret)
2187 goto out_domain;
2188 }
2189 }
2190
2191 vfio_test_domain_fgsp(domain);
2192
2193 /* replay mappings on new domains */
2194 ret = vfio_iommu_replay(iommu, domain);
2195 if (ret)
2196 goto out_detach;
2197
2198 if (resv_msi) {
2199 ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2200 if (ret && ret != -ENODEV)
2201 goto out_detach;
2202 }
2203
2204 list_add(&domain->next, &iommu->domain_list);
2205 vfio_update_pgsize_bitmap(iommu);
2206 done:
2207 /* Delete the old one and insert new iova list */
2208 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2209
2210 /*
2211 * An iommu backed group can dirty memory directly and therefore
2212 * demotes the iommu scope until it declares itself dirty tracking
2213 * capable via the page pinning interface.
2214 */
2215 iommu->pinned_page_dirty_scope = false;
2216 mutex_unlock(&iommu->lock);
2217 vfio_iommu_resv_free(&group_resv_regions);
2218
2219 return 0;
2220
2221 out_detach:
2222 vfio_iommu_detach_group(domain, group);
2223 out_domain:
2224 iommu_domain_free(domain->domain);
2225 vfio_iommu_iova_free(&iova_copy);
2226 vfio_iommu_resv_free(&group_resv_regions);
2227 out_free:
2228 kfree(domain);
2229 kfree(group);
2230 mutex_unlock(&iommu->lock);
2231 return ret;
2232 }
2233
2234 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2235 {
2236 struct rb_node *node;
2237
2238 while ((node = rb_first(&iommu->dma_list)))
2239 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2240 }
2241
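/*
 * Tear down the IOMMU mappings but keep accounting for pages that are
 * still pinned through the external (mdev) pin interface. Used when
 * the last IOMMU-backed domain goes away while an external domain
 * still exists.
 */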
2242 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2243 {
2244 struct rb_node *n, *p;
2245
2246 n = rb_first(&iommu->dma_list);
2247 for (; n; n = rb_next(n)) {
2248 struct vfio_dma *dma;
2249 long locked = 0, unlocked = 0;
2250
2251 dma = rb_entry(n, struct vfio_dma, node);
2252 unlocked += vfio_unmap_unpin(iommu, dma, false);
2253 p = rb_first(&dma->pfn_list);
2254 for (; p; p = rb_next(p)) {
2255 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2256 node);
2257
2258 if (!is_invalid_reserved_pfn(vpfn->pfn))
2259 locked++;
2260 }
2261 vfio_lock_acct(dma, locked - unlocked, true);
2262 }
2263 }
2264
2265 /*
2266 * Called when a domain is removed in detach. It is possible that
2267 * the removed domain was the one constraining the iova aperture window.
2268 * Recompute the aperture from the geometries of the remaining domains.
2269 */
2270 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2271 struct list_head *iova_copy)
2272 {
2273 struct vfio_domain *domain;
2274 struct iommu_domain_geometry geo;
2275 struct vfio_iova *node;
2276 dma_addr_t start = 0;
2277 dma_addr_t end = (dma_addr_t)~0;
2278
2279 if (list_empty(iova_copy))
2280 return;
2281
2282 list_for_each_entry(domain, &iommu->domain_list, next) {
2283 iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
2284 &geo);
2285 if (geo.aperture_start > start)
2286 start = geo.aperture_start;
2287 if (geo.aperture_end < end)
2288 end = geo.aperture_end;
2289 }
2290
2291 /* Modify the aperture limits. The new aperture is either the same or bigger */
2292 node = list_first_entry(iova_copy, struct vfio_iova, list);
2293 node->start = start;
2294 node = list_last_entry(iova_copy, struct vfio_iova, list);
2295 node->end = end;
2296 }
2297
2298 /*
2299 * Called when a group is detached. The reserved regions for that
2300 * group can become part of the valid iova now. But since reserved
2301 * regions may be duplicated among groups, rebuild the valid iova
2302 * regions list from scratch.
2303 */
2304 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2305 struct list_head *iova_copy)
2306 {
2307 struct vfio_domain *d;
2308 struct vfio_group *g;
2309 struct vfio_iova *node;
2310 dma_addr_t start, end;
2311 LIST_HEAD(resv_regions);
2312 int ret;
2313
2314 if (list_empty(iova_copy))
2315 return -EINVAL;
2316
2317 list_for_each_entry(d, &iommu->domain_list, next) {
2318 list_for_each_entry(g, &d->group_list, next) {
2319 ret = iommu_get_group_resv_regions(g->iommu_group,
2320 &resv_regions);
2321 if (ret)
2322 goto done;
2323 }
2324 }
2325
2326 node = list_first_entry(iova_copy, struct vfio_iova, list);
2327 start = node->start;
2328 node = list_last_entry(iova_copy, struct vfio_iova, list);
2329 end = node->end;
2330
2331 /* purge the iova list and create new one */
2332 vfio_iommu_iova_free(iova_copy);
2333
2334 ret = vfio_iommu_aper_resize(iova_copy, start, end);
2335 if (ret)
2336 goto done;
2337
2338 /* Exclude current reserved regions from iova ranges */
2339 ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2340 done:
2341 vfio_iommu_resv_free(&resv_regions);
2342 return ret;
2343 }
2344
2345 static void vfio_iommu_type1_detach_group(void *iommu_data,
2346 struct iommu_group *iommu_group)
2347 {
2348 struct vfio_iommu *iommu = iommu_data;
2349 struct vfio_domain *domain;
2350 struct vfio_group *group;
2351 bool update_dirty_scope = false;
2352 LIST_HEAD(iova_copy);
2353
2354 mutex_lock(&iommu->lock);
2355
2356 if (iommu->external_domain) {
2357 group = find_iommu_group(iommu->external_domain, iommu_group);
2358 if (group) {
2359 update_dirty_scope = !group->pinned_page_dirty_scope;
2360 list_del(&group->next);
2361 kfree(group);
2362
2363 if (list_empty(&iommu->external_domain->group_list)) {
2364 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
2365 WARN_ON(iommu->notifier.head);
2366 vfio_iommu_unmap_unpin_all(iommu);
2367 }
2368
2369 kfree(iommu->external_domain);
2370 iommu->external_domain = NULL;
2371 }
2372 goto detach_group_done;
2373 }
2374 }
2375
2376 /*
2377 * Get a copy of iova list. This will be used to update
2378 * and to replace the current one later. Please note that
2379 * we will leave the original list as it is if update fails.
2380 */
2381 vfio_iommu_iova_get_copy(iommu, &iova_copy);
2382
2383 list_for_each_entry(domain, &iommu->domain_list, next) {
2384 group = find_iommu_group(domain, iommu_group);
2385 if (!group)
2386 continue;
2387
2388 vfio_iommu_detach_group(domain, group);
2389 update_dirty_scope = !group->pinned_page_dirty_scope;
2390 list_del(&group->next);
2391 kfree(group);
2392 /*
2393 * Group ownership provides privilege; if the group list is
2394 * empty, the domain goes away. If it's the last domain with
2395 * an iommu and no external domain exists, then all the
2396 * mappings go away too. If it's the last domain with an iommu
2397 * and an external domain exists, update the accounting.
2398 */
2399 if (list_empty(&domain->group_list)) {
2400 if (list_is_singular(&iommu->domain_list)) {
2401 if (!iommu->external_domain) {
2402 WARN_ON(iommu->notifier.head);
2403 vfio_iommu_unmap_unpin_all(iommu);
2404 } else {
2405 vfio_iommu_unmap_unpin_reaccount(iommu);
2406 }
2407 }
2408 iommu_domain_free(domain->domain);
2409 list_del(&domain->next);
2410 kfree(domain);
2411 vfio_iommu_aper_expand(iommu, &iova_copy);
2412 vfio_update_pgsize_bitmap(iommu);
2413 }
2414 break;
2415 }
2416
2417 if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2418 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2419 else
2420 vfio_iommu_iova_free(&iova_copy);
2421
2422 detach_group_done:
2423 /*
2424 * Removal of a group without dirty tracking may allow the iommu scope
2425 * to be promoted.
2426 */
2427 if (update_dirty_scope) {
2428 update_pinned_page_dirty_scope(iommu);
2429 if (iommu->dirty_page_tracking)
2430 vfio_iommu_populate_bitmap_full(iommu);
2431 }
2432 mutex_unlock(&iommu->lock);
2433 }
2434
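/*
 * ->open() callback, reached when userspace selects this backend with
 * VFIO_SET_IOMMU. A minimal usage sketch (hypothetical fd variables,
 * error handling omitted; at least one group must already be added to
 * the container before VFIO_SET_IOMMU succeeds):
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *
 *	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
 *		ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 */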
2435 static void *vfio_iommu_type1_open(unsigned long arg)
2436 {
2437 struct vfio_iommu *iommu;
2438
2439 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2440 if (!iommu)
2441 return ERR_PTR(-ENOMEM);
2442
2443 switch (arg) {
2444 case VFIO_TYPE1_IOMMU:
2445 break;
2446 case VFIO_TYPE1_NESTING_IOMMU:
2447 iommu->nesting = true;
2448 fallthrough;
2449 case VFIO_TYPE1v2_IOMMU:
2450 iommu->v2 = true;
2451 break;
2452 default:
2453 kfree(iommu);
2454 return ERR_PTR(-EINVAL);
2455 }
2456
2457 INIT_LIST_HEAD(&iommu->domain_list);
2458 INIT_LIST_HEAD(&iommu->iova_list);
2459 iommu->dma_list = RB_ROOT;
2460 iommu->dma_avail = dma_entry_limit;
2461 mutex_init(&iommu->lock);
2462 BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
2463
2464 return iommu;
2465 }
2466
2467 static void vfio_release_domain(struct vfio_domain *domain, bool external)
2468 {
2469 struct vfio_group *group, *group_tmp;
2470
2471 list_for_each_entry_safe(group, group_tmp,
2472 &domain->group_list, next) {
2473 if (!external)
2474 vfio_iommu_detach_group(domain, group);
2475 list_del(&group->next);
2476 kfree(group);
2477 }
2478
2479 if (!external)
2480 iommu_domain_free(domain->domain);
2481 }
2482
2483 static void vfio_iommu_type1_release(void *iommu_data)
2484 {
2485 struct vfio_iommu *iommu = iommu_data;
2486 struct vfio_domain *domain, *domain_tmp;
2487
2488 if (iommu->external_domain) {
2489 vfio_release_domain(iommu->external_domain, true);
2490 kfree(iommu->external_domain);
2491 }
2492
2493 vfio_iommu_unmap_unpin_all(iommu);
2494
2495 list_for_each_entry_safe(domain, domain_tmp,
2496 &iommu->domain_list, next) {
2497 vfio_release_domain(domain, false);
2498 list_del(&domain->next);
2499 kfree(domain);
2500 }
2501
2502 vfio_iommu_iova_free(&iommu->iova_list);
2503
2504 kfree(iommu);
2505 }
2506
2507 static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
2508 {
2509 struct vfio_domain *domain;
2510 int ret = 1;
2511
2512 mutex_lock(&iommu->lock);
2513 list_for_each_entry(domain, &iommu->domain_list, next) {
2514 if (!(domain->prot & IOMMU_CACHE)) {
2515 ret = 0;
2516 break;
2517 }
2518 }
2519 mutex_unlock(&iommu->lock);
2520
2521 return ret;
2522 }
2523
2524 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2525 unsigned long arg)
2526 {
2527 switch (arg) {
2528 case VFIO_TYPE1_IOMMU:
2529 case VFIO_TYPE1v2_IOMMU:
2530 case VFIO_TYPE1_NESTING_IOMMU:
2531 return 1;
2532 case VFIO_DMA_CC_IOMMU:
2533 if (!iommu)
2534 return 0;
2535 return vfio_domains_have_iommu_cache(iommu);
2536 default:
2537 return 0;
2538 }
2539 }
2540
2541 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2542 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2543 size_t size)
2544 {
2545 struct vfio_info_cap_header *header;
2546 struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2547
2548 header = vfio_info_cap_add(caps, size,
2549 VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2550 if (IS_ERR(header))
2551 return PTR_ERR(header);
2552
2553 iova_cap = container_of(header,
2554 struct vfio_iommu_type1_info_cap_iova_range,
2555 header);
2556 iova_cap->nr_iovas = cap_iovas->nr_iovas;
2557 memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2558 cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2559 return 0;
2560 }
2561
2562 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2563 struct vfio_info_cap *caps)
2564 {
2565 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2566 struct vfio_iova *iova;
2567 size_t size;
2568 int iovas = 0, i = 0, ret;
2569
2570 list_for_each_entry(iova, &iommu->iova_list, list)
2571 iovas++;
2572
2573 if (!iovas) {
2574 /*
2575 * Return 0 as a container with a single mdev device
2576 * will have an empty list
2577 */
2578 return 0;
2579 }
2580
2581 size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
2582
2583 cap_iovas = kzalloc(size, GFP_KERNEL);
2584 if (!cap_iovas)
2585 return -ENOMEM;
2586
2587 cap_iovas->nr_iovas = iovas;
2588
2589 list_for_each_entry(iova, &iommu->iova_list, list) {
2590 cap_iovas->iova_ranges[i].start = iova->start;
2591 cap_iovas->iova_ranges[i].end = iova->end;
2592 i++;
2593 }
2594
2595 ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2596
2597 kfree(cap_iovas);
2598 return ret;
2599 }
2600
2601 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2602 struct vfio_info_cap *caps)
2603 {
2604 struct vfio_iommu_type1_info_cap_migration cap_mig;
2605
2606 cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2607 cap_mig.header.version = 1;
2608
2609 cap_mig.flags = 0;
2610 /* support minimum pgsize */
2611 cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2612 cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2613
2614 return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2615 }
2616
2617 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2618 struct vfio_info_cap *caps)
2619 {
2620 struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2621
2622 cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2623 cap_dma_avail.header.version = 1;
2624
2625 cap_dma_avail.avail = iommu->dma_avail;
2626
2627 return vfio_info_add_capability(caps, &cap_dma_avail.header,
2628 sizeof(cap_dma_avail));
2629 }
2630
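/*
 * VFIO_IOMMU_GET_INFO handler. Reports the supported IOMMU page sizes
 * and, when the caller provides a large enough argsz, a capability
 * chain containing the migration, available-DMA-entry and valid iova
 * range capabilities built above.
 */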
2631 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2632 unsigned long arg)
2633 {
2634 struct vfio_iommu_type1_info info;
2635 unsigned long minsz;
2636 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2637 unsigned long capsz;
2638 int ret;
2639
2640 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2641
2642 /* For backward compatibility, cannot require this */
2643 capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
2644
2645 if (copy_from_user(&info, (void __user *)arg, minsz))
2646 return -EFAULT;
2647
2648 if (info.argsz < minsz)
2649 return -EINVAL;
2650
2651 if (info.argsz >= capsz) {
2652 minsz = capsz;
2653 info.cap_offset = 0; /* output, no-recopy necessary */
2654 }
2655
2656 mutex_lock(&iommu->lock);
2657 info.flags = VFIO_IOMMU_INFO_PGSIZES;
2658
2659 info.iova_pgsizes = iommu->pgsize_bitmap;
2660
2661 ret = vfio_iommu_migration_build_caps(iommu, &caps);
2662
2663 if (!ret)
2664 ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2665
2666 if (!ret)
2667 ret = vfio_iommu_iova_build_caps(iommu, &caps);
2668
2669 mutex_unlock(&iommu->lock);
2670
2671 if (ret)
2672 return ret;
2673
2674 if (caps.size) {
2675 info.flags |= VFIO_IOMMU_INFO_CAPS;
2676
2677 if (info.argsz < sizeof(info) + caps.size) {
2678 info.argsz = sizeof(info) + caps.size;
2679 } else {
2680 vfio_info_cap_shift(&caps, sizeof(info));
2681 if (copy_to_user((void __user *)arg +
2682 sizeof(info), caps.buf,
2683 caps.size)) {
2684 kfree(caps.buf);
2685 return -EFAULT;
2686 }
2687 info.cap_offset = sizeof(info);
2688 }
2689
2690 kfree(caps.buf);
2691 }
2692
2693 return copy_to_user((void __user *)arg, &info, minsz) ?
2694 -EFAULT : 0;
2695 }
2696
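/*
 * VFIO_IOMMU_MAP_DMA handler. A minimal userspace sketch (hypothetical
 * buffer/fd names, error handling omitted); vaddr, iova and size must
 * be aligned to the smallest IOMMU page size reported by
 * VFIO_IOMMU_GET_INFO:
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = iova,
 *		.size  = size,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */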
2697 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2698 unsigned long arg)
2699 {
2700 struct vfio_iommu_type1_dma_map map;
2701 unsigned long minsz;
2702 uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
2703
2704 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2705
2706 if (copy_from_user(&map, (void __user *)arg, minsz))
2707 return -EFAULT;
2708
2709 if (map.argsz < minsz || map.flags & ~mask)
2710 return -EINVAL;
2711
2712 return vfio_dma_do_map(iommu, &map);
2713 }
2714
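/*
 * VFIO_IOMMU_UNMAP_DMA handler. In the v2 interface an unmap range may
 * span multiple mappings but cannot split an existing one. With
 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP the dirty bitmap for the
 * unmapped range is also copied out while dirty page tracking is
 * enabled.
 */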
2715 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2716 unsigned long arg)
2717 {
2718 struct vfio_iommu_type1_dma_unmap unmap;
2719 struct vfio_bitmap bitmap = { 0 };
2720 unsigned long minsz;
2721 int ret;
2722
2723 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2724
2725 if (copy_from_user(&unmap, (void __user *)arg, minsz))
2726 return -EFAULT;
2727
2728 if (unmap.argsz < minsz ||
2729 unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
2730 return -EINVAL;
2731
2732 if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2733 unsigned long pgshift;
2734
2735 if (unmap.argsz < (minsz + sizeof(bitmap)))
2736 return -EINVAL;
2737
2738 if (copy_from_user(&bitmap,
2739 (void __user *)(arg + minsz),
2740 sizeof(bitmap)))
2741 return -EFAULT;
2742
2743 if (!access_ok((void __user *)bitmap.data, bitmap.size))
2744 return -EINVAL;
2745
2746 pgshift = __ffs(bitmap.pgsize);
2747 ret = verify_bitmap_size(unmap.size >> pgshift,
2748 bitmap.size);
2749 if (ret)
2750 return ret;
2751 }
2752
2753 ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2754 if (ret)
2755 return ret;
2756
2757 return copy_to_user((void __user *)arg, &unmap, minsz) ?
2758 -EFAULT : 0;
2759 }
2760
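/*
 * VFIO_IOMMU_DIRTY_PAGES handler. Exactly one of the START, STOP or
 * GET_BITMAP flags may be set per call: START allocates per-mapping
 * bitmaps and enables tracking, STOP tears it down, and GET_BITMAP
 * copies out the dirty state for an iova range using the smallest
 * supported IOMMU page size as the bitmap granularity.
 */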
2761 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2762 unsigned long arg)
2763 {
2764 struct vfio_iommu_type1_dirty_bitmap dirty;
2765 uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
2766 VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
2767 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2768 unsigned long minsz;
2769 int ret = 0;
2770
2771 if (!iommu->v2)
2772 return -EACCES;
2773
2774 minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2775
2776 if (copy_from_user(&dirty, (void __user *)arg, minsz))
2777 return -EFAULT;
2778
2779 if (dirty.argsz < minsz || dirty.flags & ~mask)
2780 return -EINVAL;
2781
2782 /* only one flag should be set at a time */
2783 if (__ffs(dirty.flags) != __fls(dirty.flags))
2784 return -EINVAL;
2785
2786 if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2787 size_t pgsize;
2788
2789 mutex_lock(&iommu->lock);
2790 pgsize = 1 << __ffs(iommu->pgsize_bitmap);
2791 if (!iommu->dirty_page_tracking) {
2792 ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2793 if (!ret)
2794 iommu->dirty_page_tracking = true;
2795 }
2796 mutex_unlock(&iommu->lock);
2797 return ret;
2798 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2799 mutex_lock(&iommu->lock);
2800 if (iommu->dirty_page_tracking) {
2801 iommu->dirty_page_tracking = false;
2802 vfio_dma_bitmap_free_all(iommu);
2803 }
2804 mutex_unlock(&iommu->lock);
2805 return 0;
2806 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2807 struct vfio_iommu_type1_dirty_bitmap_get range;
2808 unsigned long pgshift;
2809 size_t data_size = dirty.argsz - minsz;
2810 size_t iommu_pgsize;
2811
2812 if (!data_size || data_size < sizeof(range))
2813 return -EINVAL;
2814
2815 if (copy_from_user(&range, (void __user *)(arg + minsz),
2816 sizeof(range)))
2817 return -EFAULT;
2818
2819 if (range.iova + range.size < range.iova)
2820 return -EINVAL;
2821 if (!access_ok((void __user *)range.bitmap.data,
2822 range.bitmap.size))
2823 return -EINVAL;
2824
2825 pgshift = __ffs(range.bitmap.pgsize);
2826 ret = verify_bitmap_size(range.size >> pgshift,
2827 range.bitmap.size);
2828 if (ret)
2829 return ret;
2830
2831 mutex_lock(&iommu->lock);
2832
2833 iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2834
2835 /* allow only smallest supported pgsize */
2836 if (range.bitmap.pgsize != iommu_pgsize) {
2837 ret = -EINVAL;
2838 goto out_unlock;
2839 }
2840 if (range.iova & (iommu_pgsize - 1)) {
2841 ret = -EINVAL;
2842 goto out_unlock;
2843 }
2844 if (!range.size || range.size & (iommu_pgsize - 1)) {
2845 ret = -EINVAL;
2846 goto out_unlock;
2847 }
2848
2849 if (iommu->dirty_page_tracking)
2850 ret = vfio_iova_dirty_bitmap(range.bitmap.data,
2851 iommu, range.iova,
2852 range.size,
2853 range.bitmap.pgsize);
2854 else
2855 ret = -EINVAL;
2856 out_unlock:
2857 mutex_unlock(&iommu->lock);
2858
2859 return ret;
2860 }
2861
2862 return -EINVAL;
2863 }
2864
2865 static long vfio_iommu_type1_ioctl(void *iommu_data,
2866 unsigned int cmd, unsigned long arg)
2867 {
2868 struct vfio_iommu *iommu = iommu_data;
2869
2870 switch (cmd) {
2871 case VFIO_CHECK_EXTENSION:
2872 return vfio_iommu_type1_check_extension(iommu, arg);
2873 case VFIO_IOMMU_GET_INFO:
2874 return vfio_iommu_type1_get_info(iommu, arg);
2875 case VFIO_IOMMU_MAP_DMA:
2876 return vfio_iommu_type1_map_dma(iommu, arg);
2877 case VFIO_IOMMU_UNMAP_DMA:
2878 return vfio_iommu_type1_unmap_dma(iommu, arg);
2879 case VFIO_IOMMU_DIRTY_PAGES:
2880 return vfio_iommu_type1_dirty_pages(iommu, arg);
2881 default:
2882 return -ENOTTY;
2883 }
2884 }
2885
2886 static int vfio_iommu_type1_register_notifier(void *iommu_data,
2887 unsigned long *events,
2888 struct notifier_block *nb)
2889 {
2890 struct vfio_iommu *iommu = iommu_data;
2891
2892 /* clear known events */
2893 *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
2894
2895 /* refuse to register if any unknown events remain */
2896 if (*events)
2897 return -EINVAL;
2898
2899 return blocking_notifier_chain_register(&iommu->notifier, nb);
2900 }
2901
2902 static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
2903 struct notifier_block *nb)
2904 {
2905 struct vfio_iommu *iommu = iommu_data;
2906
2907 return blocking_notifier_chain_unregister(&iommu->notifier, nb);
2908 }
2909
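/*
 * Copy to or from a single vfio_dma mapping using the mapping owner's
 * mm, typically on behalf of an mdev vendor driver calling
 * vfio_dma_rw(). The caller loops over chunks in
 * vfio_iommu_type1_dma_rw() below.
 */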
2910 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
2911 dma_addr_t user_iova, void *data,
2912 size_t count, bool write,
2913 size_t *copied)
2914 {
2915 struct mm_struct *mm;
2916 unsigned long vaddr;
2917 struct vfio_dma *dma;
2918 bool kthread = current->mm == NULL;
2919 size_t offset;
2920
2921 *copied = 0;
2922
2923 dma = vfio_find_dma(iommu, user_iova, 1);
2924 if (!dma)
2925 return -EINVAL;
2926
2927 if ((write && !(dma->prot & IOMMU_WRITE)) ||
2928 !(dma->prot & IOMMU_READ))
2929 return -EPERM;
2930
2931 mm = get_task_mm(dma->task);
2932
2933 if (!mm)
2934 return -EPERM;
2935
2936 if (kthread)
2937 kthread_use_mm(mm);
2938 else if (current->mm != mm)
2939 goto out;
2940
2941 offset = user_iova - dma->iova;
2942
2943 if (count > dma->size - offset)
2944 count = dma->size - offset;
2945
2946 vaddr = dma->vaddr + offset;
2947
2948 if (write) {
2949 *copied = copy_to_user((void __user *)vaddr, data,
2950 count) ? 0 : count;
2951 if (*copied && iommu->dirty_page_tracking) {
2952 unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
2953 /*
2954 * Bitmap populated with the smallest supported page
2955 * size
2956 */
2957 bitmap_set(dma->bitmap, offset >> pgshift,
2958 ((offset + *copied - 1) >> pgshift) -
2959 (offset >> pgshift) + 1);
2960 }
2961 } else
2962 *copied = copy_from_user(data, (void __user *)vaddr,
2963 count) ? 0 : count;
2964 if (kthread)
2965 kthread_unuse_mm(mm);
2966 out:
2967 mmput(mm);
2968 return *copied ? 0 : -EFAULT;
2969 }
2970
2971 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
2972 void *data, size_t count, bool write)
2973 {
2974 struct vfio_iommu *iommu = iommu_data;
2975 int ret = 0;
2976 size_t done;
2977
2978 mutex_lock(&iommu->lock);
2979 while (count > 0) {
2980 ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
2981 count, write, &done);
2982 if (ret)
2983 break;
2984
2985 count -= done;
2986 data += done;
2987 user_iova += done;
2988 }
2989
2990 mutex_unlock(&iommu->lock);
2991 return ret;
2992 }
2993
2994 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
2995 .name = "vfio-iommu-type1",
2996 .owner = THIS_MODULE,
2997 .open = vfio_iommu_type1_open,
2998 .release = vfio_iommu_type1_release,
2999 .ioctl = vfio_iommu_type1_ioctl,
3000 .attach_group = vfio_iommu_type1_attach_group,
3001 .detach_group = vfio_iommu_type1_detach_group,
3002 .pin_pages = vfio_iommu_type1_pin_pages,
3003 .unpin_pages = vfio_iommu_type1_unpin_pages,
3004 .register_notifier = vfio_iommu_type1_register_notifier,
3005 .unregister_notifier = vfio_iommu_type1_unregister_notifier,
3006 .dma_rw = vfio_iommu_type1_dma_rw,
3007 };
3008
3009 static int __init vfio_iommu_type1_init(void)
3010 {
3011 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3012 }
3013
3014 static void __exit vfio_iommu_type1_cleanup(void)
3015 {
3016 vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3017 }
3018
3019 module_init(vfio_iommu_type1_init);
3020 module_exit(vfio_iommu_type1_cleanup);
3021
3022 MODULE_VERSION(DRIVER_VERSION);
3023 MODULE_LICENSE("GPL v2");
3024 MODULE_AUTHOR(DRIVER_AUTHOR);
3025 MODULE_DESCRIPTION(DRIVER_DESC);
3026