// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "double_span.h"
#include "io_pagetable.h"

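/*
 * One entry in a caller-supplied pages_list: a slice of an iopt_pages starting
 * at @start_byte for @length bytes, plus the iopt_area that will map it once
 * the list is committed by iopt_map_pages().
 */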
struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

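/*
 * iopt_area_contig_init()/iopt_area_contig_next() back the
 * iopt_for_each_contig_area() iterator: they walk the areas covering
 * [iova, last_iova] and stop at the first gap or not-yet-populated area.
 * A minimal sketch of the usage pattern, mirroring the callers further down
 * in this file:
 *
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova, iopt_area_last_iova(area));
 *
 *		...operate on [iter.cur_iova, last] of this area...
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		...the requested range was not fully covered...
 */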
struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

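/*
 * The __alloc_iova_check_*() helpers below test whether a candidate span can
 * hold @length bytes starting at an iova_alignment boundary whose low bits
 * equal @page_offset.  A worked example with hypothetical numbers: for
 * *start = 0x10300, iova_alignment = 0x1000 and page_offset = 0x300, the
 * ALIGN() step yields 0x11000 and OR'ing in the page offset gives an
 * aligned_start of 0x11300.
 */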
static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
				     unsigned long length,
				     unsigned long iova_alignment,
				     unsigned long page_offset)
{
	unsigned long aligned_start;

	/* ALIGN_UP() */
	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
		return false;
	aligned_start &= ~(iova_alignment - 1);
	aligned_start |= page_offset;

	if (aligned_start >= last || last - aligned_start < length - 1)
		return false;
	*start = aligned_start;
	return true;
}

static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used)
		return false;
	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
					length, iova_alignment, page_offset);
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole)
		return false;
	return __alloc_iova_check_range(&span->start_used, span->last_used,
					length, iova_alignment, page_offset);
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long uptr, unsigned long length)
{
	unsigned long page_offset = uptr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep the alignment present in uptr when building the IOVA; this
	 * increases the chance we can map a THP.
	 */
	if (!uptr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(uptr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}

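/*
 * Validate a caller-chosen IOVA range: it must be aligned to iova_alignment,
 * must not wrap around, must not intersect any reserved IOVA and must not
 * overlap an existing mapping.
 */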
static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with NULL pages, indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

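/*
 * First pass of mapping a pages_list: allocate an area for every list entry,
 * then, under the iova_rwsem, pick or validate the IOVA and insert the areas
 * back to back starting at *dst_iova.  The areas still have NULL pages; the
 * second pass in iopt_map_pages() fills the domains and attaches the pages.
 */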
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		rc = iopt_alloc_iova(
			iopt, dst_iova,
			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on
 *        input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only set up a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(elm.pages))
		return PTR_ERR(elm.pages);
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.start_byte = uptr - elm.pages->uptr;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}
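/*
 * A minimal usage sketch for iopt_map_user_pages() and iopt_unmap_iova()
 * (defined later in this file); hypothetical caller, error handling elided:
 *
 *	unsigned long iova = 0;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	...use the mapping at iova...
 *	rc = iopt_unmap_iova(iopt, iova, length, NULL);
 */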

struct iova_bitmap_fn_arg {
	unsigned long flags;
	struct io_pagetable *iopt;
	struct iommu_domain *domain;
	struct iommu_dirty_bitmap *dirty;
};

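/*
 * Callback for iova_bitmap_for_each(): walk the contiguous areas covering
 * [iova, iova + length - 1] and have the domain's dirty_ops report (and
 * optionally clear) the dirty bits into the shared bitmap.
 */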
static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
					unsigned long iova, size_t length,
					void *opaque)
{
	struct iopt_area *area;
	struct iopt_area_contig_iter iter;
	struct iova_bitmap_fn_arg *arg = opaque;
	struct iommu_domain *domain = arg->domain;
	struct iommu_dirty_bitmap *dirty = arg->dirty;
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	unsigned long last_iova = iova + length - 1;
	unsigned long flags = arg->flags;
	int ret;

	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
						last - iter.cur_iova + 1, flags,
						dirty);
		if (ret)
			return ret;
	}

	if (!iopt_area_contig_done(&iter))
		return -EINVAL;
	return 0;
}

static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
			   struct io_pagetable *iopt, unsigned long flags,
			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iova_bitmap_fn_arg arg;
	struct iova_bitmap *iter;
	int ret = 0;

	if (!ops || !ops->read_and_clear_dirty)
		return -EOPNOTSUPP;

	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
				 bitmap->page_size,
				 u64_to_user_ptr(bitmap->data));
	if (IS_ERR(iter))
		return -ENOMEM;

	iommu_dirty_bitmap_init(&dirty, iter, &gather);

	arg.flags = flags;
	arg.iopt = iopt;
	arg.domain = domain;
	arg.dirty = &dirty;
	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);

	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
		iommu_iotlb_sync(domain, &gather);

	iova_bitmap_free(iter);

	return ret;
}

int iommufd_check_iova_range(struct io_pagetable *iopt,
			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	size_t iommu_pgsize = iopt->iova_alignment;
	u64 last_iova;

	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
		return -EOVERFLOW;

	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
		return -EOVERFLOW;

	if ((bitmap->iova & (iommu_pgsize - 1)) ||
	    ((last_iova + 1) & (iommu_pgsize - 1)))
		return -EINVAL;

	if (!bitmap->page_size)
		return -EINVAL;

	if ((bitmap->iova & (bitmap->page_size - 1)) ||
	    ((last_iova + 1) & (bitmap->page_size - 1)))
		return -EINVAL;

	return 0;
}

int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
				   struct iommu_domain *domain,
				   unsigned long flags,
				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	int ret;

	ret = iommufd_check_iova_range(iopt, bitmap);
	if (ret)
		return ret;

	down_read(&iopt->iova_rwsem);
	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
	up_read(&iopt->iova_rwsem);

	return ret;
}

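/*
 * Clear the dirty bits of every mapped area in @domain, discarding the
 * results.  Used before enabling dirty tracking so that the first read
 * reports a clean snapshot.
 */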
static int iopt_clear_dirty_data(struct io_pagetable *iopt,
				 struct iommu_domain *domain)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iopt_area *area;
	int ret = 0;

	lockdep_assert_held_read(&iopt->iova_rwsem);

	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	iommu_iotlb_sync(domain, &gather);
	return ret;
}

int iopt_set_dirty_tracking(struct io_pagetable *iopt,
			    struct iommu_domain *domain, bool enable)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	int ret = 0;

	if (!ops)
		return -EOPNOTSUPP;

	down_read(&iopt->iova_rwsem);

	/* Clear dirty bits from PTEs to ensure a clean snapshot */
	if (enable) {
		ret = iopt_clear_dirty_data(iopt, domain);
		if (ret)
			goto out_unlock;
	}

	ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
	up_read(&iopt->iova_rwsem);
	return ret;
}

int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

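/*
 * Unmap every area fully contained in [start, last].  If an in-kernel access
 * still holds pages in an area, the locks are dropped, the access is notified
 * via iommufd_access_notify_unmap() and the walk is retried, up to a bounded
 * number of attempts.
 */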
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100)) {
				rc = -EDEADLOCK;
				goto out_unmapped;
			}
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
out_unmapped:
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

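/*
 * Mark [start, last] as reserved so it can never be allocated or mapped.  The
 * @owner cookie identifies the reservation so __iopt_remove_reserved_iova()
 * can later drop all reservations belonging to that owner.
 */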
int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is still holding all the pfns, so rapidly unmap
	 * this domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

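/*
 * Attach an iommu_domain to the io_pagetable: compute the new iova_alignment
 * from the domain's page sizes, reserve the IOVA outside the domain's
 * aperture, fill the domain with the PFNs of every existing area and finally
 * store it in the domains xarray.
 */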
int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split() - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last iova of the first (left-hand) new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages have not been
	 * changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}

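/*
 * Split the areas covering each iova in @iovas so that an unmap boundary can
 * land there.  IOVAs that do not hit an existing area are silently skipped;
 * this is used by the VFIO compatibility path described for iopt_area_split()
 * above.
 */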
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

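/*
 * Register an in-kernel access with the io_pagetable.  The access gets an id
 * in the access_list xarray and the iova_alignment is recomputed so it also
 * satisfies the access's alignment requirement.
 */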
int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);
	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}