// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
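
/*
 * Rough layering sketch, for orientation (illustrative only; see
 * io_pagetable.h for the authoritative definitions):
 *
 *	io_pagetable: area_itree (iopt_area's keyed by IOVA), reserved_itree,
 *		      allowed_itree, domains xarray, access_list xarray
 *	iopt_area:    one slice of an iopt_pages mapped at an IOVA range
 *	iopt_pages:   pinned user memory tracked in PAGE_SIZE units
 */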
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>

#include "io_pagetable.h"
#include "double_span.h"

struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}
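
/*
 * Illustrative use of the contiguous-area iterator above, assuming the
 * iopt_for_each_contig_area() and iopt_area_contig_done() helpers from
 * io_pagetable.h. A minimal sketch of a caller that requires the whole
 * [iova, last_iova] range to be covered by populated areas:
 *
 *	struct iopt_area_contig_iter iter;
 *	struct iopt_area *area;
 *	int rc = 0;
 *
 *	down_read(&iopt->iova_rwsem);
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		// iter.cur_iova .. min(last_iova, iopt_area_last_iova(area))
 *		// is backed by this area
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		rc = -ENOENT;	// a hole interrupted the range
 *	up_read(&iopt->iova_rwsem);
 */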

static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used || span->last_hole - span->start_hole < length - 1)
		return false;

	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
			   page_offset;
	if (span->start_hole > span->last_hole ||
	    span->last_hole - span->start_hole < length - 1)
		return false;
	return true;
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole || span->last_used - span->start_used < length - 1)
		return false;

	span->start_used = ALIGN(span->start_used, iova_alignment) |
			   page_offset;
	if (span->start_used > span->last_used ||
	    span->last_used - span->start_used < length - 1)
		return false;
	return true;
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long uptr, unsigned long length)
{
	unsigned long page_offset = uptr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep any alignment present in the uptr when building the IOVA; this
	 * increases the chance we can map a THP.
	 */
	if (!uptr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(uptr));

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}
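
/*
 * Worked example of the alignment choice above (hypothetical numbers): for
 * uptr = 0x7f1234510000 and length = 0x30000, roundup_pow_of_two(length) is
 * 0x40000 and 1UL << __ffs64(uptr) is 0x10000 (the lowest set bit of the user
 * VA), so iova_alignment = 0x10000 and page_offset = 0. The allocator then
 * searches the allowed spans for a hole whose ALIGN()ed start, OR'd with
 * page_offset, still leaves room for 'length' bytes. Keeping the IOVA and the
 * uptr congruent modulo a large power of two is what makes THP-sized IOMMU
 * mappings possible later.
 */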

static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages pointer, indicating it is not
	 * fully initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}
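
/*
 * Worked example of the index math above (hypothetical values, 4K pages and
 * an iova_alignment that permits sub-page offsets): for iova = 0x10000000,
 * start_byte = 0x1800 and length = 0x3000, the area occupies node.start/last
 * = [0x10000000, 0x10002fff] in the IOVA tree, has page_offset = 0x800, and
 * spans pages_node.start/last = [1, 4] in the iopt_pages, i.e. page indexes
 * start_byte / PAGE_SIZE through (start_byte + length - 1) / PAGE_SIZE
 * inclusive.
 */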

static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		rc = iopt_alloc_iova(
			iopt, dst_iova,
			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on
 *        input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only setup a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(elm.pages))
		return PTR_ERR(elm.pages);
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.start_byte = uptr - elm.pages->uptr;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}
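
/*
 * Minimal usage sketch for iopt_map_user_pages(), assuming a hypothetical
 * in-kernel caller that already holds an ictx and iopt and wants the IOVA
 * chosen automatically:
 *
 *	unsigned long iova;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, user_va, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *	// ... use the mapping, then tear it down ...
 *	rc = iopt_unmap_iova(iopt, iova, length, NULL);
 */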

int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}
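
/*
 * Illustrative note on the "superset" rule above (hypothetical layout): if
 * the table holds two areas [0x0, 0xfff] and [0x1000, 0x2fff], then
 * iopt_unmap_iova(iopt, 0x0, 0x3000, &bytes) removes both and returns
 * bytes == 0x3000, while iopt_unmap_iova(iopt, 0x1000, 0x1000, ...) fails
 * with -ENOENT because it would truncate the second area. VFIO-style hole
 * punching must first split areas via iopt_cut_iova().
 */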

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}
/* Check that all existing areas conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}
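
/*
 * Worked example of the alignment derivation above (hypothetical bitmap): a
 * domain advertising pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G has its lowest set
 * bit at 4K, so 1UL << __ffs(pgsize_bitmap) == 4096 and the iopt's
 * iova_alignment becomes max(4096, current alignment). A domain whose
 * smallest IO page size were larger than PAGE_SIZE would be rejected with
 * -EINVAL, since iopt_pages tracks PFNs in PAGE_SIZE units.
 */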

static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages haven't been
	 * changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}
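
/*
 * Worked example of a split (hypothetical values): an area covering IOVA
 * [0x100000, 0x2fffff] that is cut at iova = 0x1fffff becomes lhs covering
 * [0x100000, 0x1fffff] and rhs covering [0x200000, 0x2fffff]. Both halves
 * keep pointing at the same iopt_pages, with rhs's start_byte advanced by
 * 0x100000; only the interval trees and the pages refcount change.
 */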

int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);

	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}