1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of the data structure that maps IOVAs to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The data structure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and xarray.
10  */
11 #include <linux/err.h>
12 #include <linux/errno.h>
13 #include <linux/iommu.h>
14 #include <linux/iommufd.h>
15 #include <linux/lockdep.h>
16 #include <linux/sched/mm.h>
17 #include <linux/slab.h>
18 #include <uapi/linux/iommufd.h>
19 
20 #include "double_span.h"
21 #include "io_pagetable.h"
22 
23 struct iopt_pages_list {
24 	struct iopt_pages *pages;
25 	struct iopt_area *area;
26 	struct list_head next;
27 	unsigned long start_byte;
28 	unsigned long length;
29 };
30 
31 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
32 					struct io_pagetable *iopt,
33 					unsigned long iova,
34 					unsigned long last_iova)
35 {
36 	lockdep_assert_held(&iopt->iova_rwsem);
37 
38 	iter->cur_iova = iova;
39 	iter->last_iova = last_iova;
40 	iter->area = iopt_area_iter_first(iopt, iova, iova);
41 	if (!iter->area)
42 		return NULL;
43 	if (!iter->area->pages) {
44 		iter->area = NULL;
45 		return NULL;
46 	}
47 	return iter->area;
48 }
49 
50 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
51 {
52 	unsigned long last_iova;
53 
54 	if (!iter->area)
55 		return NULL;
56 	last_iova = iopt_area_last_iova(iter->area);
57 	if (iter->last_iova <= last_iova)
58 		return NULL;
59 
60 	iter->cur_iova = last_iova + 1;
61 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
62 					 iter->last_iova);
63 	if (!iter->area)
64 		return NULL;
65 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
66 	    !iter->area->pages) {
67 		iter->area = NULL;
68 		return NULL;
69 	}
70 	return iter->area;
71 }
72 
73 static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
74 				     unsigned long length,
75 				     unsigned long iova_alignment,
76 				     unsigned long page_offset)
77 {
78 	unsigned long aligned_start;
79 
80 	/* ALIGN_UP() */
81 	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
82 		return false;
83 	aligned_start &= ~(iova_alignment - 1);
84 	aligned_start |= page_offset;
85 
86 	if (aligned_start >= last || last - aligned_start < length - 1)
87 		return false;
88 	*start = aligned_start;
89 	return true;
90 }
91 
92 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
93 				    unsigned long length,
94 				    unsigned long iova_alignment,
95 				    unsigned long page_offset)
96 {
97 	if (span->is_used)
98 		return false;
99 	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
100 					length, iova_alignment, page_offset);
101 }
102 
103 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
104 				    unsigned long length,
105 				    unsigned long iova_alignment,
106 				    unsigned long page_offset)
107 {
108 	if (span->is_hole)
109 		return false;
110 	return __alloc_iova_check_range(&span->start_used, span->last_used,
111 					length, iova_alignment, page_offset);
112 }
113 
114 /*
115  * Automatically find a block of IOVA that is not being used and not reserved.
116  * Does not return a 0 IOVA even if it is valid.
117  */
118 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
119 			   unsigned long uptr, unsigned long length)
120 {
121 	unsigned long page_offset = uptr % PAGE_SIZE;
122 	struct interval_tree_double_span_iter used_span;
123 	struct interval_tree_span_iter allowed_span;
124 	unsigned long max_alignment = PAGE_SIZE;
125 	unsigned long iova_alignment;
126 
127 	lockdep_assert_held(&iopt->iova_rwsem);
128 
129 	/* Protect roundup_pow_of_two() from overflow */
130 	if (length == 0 || length >= ULONG_MAX / 2)
131 		return -EOVERFLOW;
132 
133 	/*
134 	 * Keep any alignment present in the uptr when building the IOVA; this
135 	 * increases the chance we can map a THP.
136 	 */
137 	if (!uptr)
138 		iova_alignment = roundup_pow_of_two(length);
139 	else
140 		iova_alignment = min_t(unsigned long,
141 				       roundup_pow_of_two(length),
142 				       1UL << __ffs64(uptr));
143 
144 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
145 	max_alignment = HPAGE_SIZE;
146 #endif
147 	/* Protect against ALIGN() overflow */
148 	if (iova_alignment >= max_alignment)
149 		iova_alignment = max_alignment;
150 
151 	if (iova_alignment < iopt->iova_alignment)
152 		return -EINVAL;
153 
154 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
155 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
156 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
157 			allowed_span.start_used = PAGE_SIZE;
158 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
159 			allowed_span.is_hole = false;
160 		}
161 
162 		if (!__alloc_iova_check_used(&allowed_span, length,
163 					     iova_alignment, page_offset))
164 			continue;
165 
166 		interval_tree_for_each_double_span(
167 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
168 			allowed_span.start_used, allowed_span.last_used) {
169 			if (!__alloc_iova_check_hole(&used_span, length,
170 						     iova_alignment,
171 						     page_offset))
172 				continue;
173 
174 			*iova = used_span.start_hole;
175 			return 0;
176 		}
177 	}
178 	return -ENOSPC;
179 }
180 
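/*
 * Validate a caller-supplied IOVA range: it must be aligned to the iopt's
 * alignment, must not wrap around, and must not intersect any reserved IOVA
 * or an already-mapped area.
 */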
181 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
182 			   unsigned long length)
183 {
184 	unsigned long last;
185 
186 	lockdep_assert_held(&iopt->iova_rwsem);
187 
188 	if ((iova & (iopt->iova_alignment - 1)))
189 		return -EINVAL;
190 
191 	if (check_add_overflow(iova, length - 1, &last))
192 		return -EOVERFLOW;
193 
194 	/* No reserved IOVA intersects the range */
195 	if (iopt_reserved_iter_first(iopt, iova, last))
196 		return -EINVAL;
197 
198 	/* Check that there is not already a mapping in the range */
199 	if (iopt_area_iter_first(iopt, iova, last))
200 		return -EEXIST;
201 	return 0;
202 }
203 
204 /*
205  * The area takes a slice of the pages from start_byte to start_byte + length
206  */
207 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
208 			    struct iopt_pages *pages, unsigned long iova,
209 			    unsigned long start_byte, unsigned long length,
210 			    int iommu_prot)
211 {
212 	lockdep_assert_held_write(&iopt->iova_rwsem);
213 
214 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
215 		return -EPERM;
216 
217 	area->iommu_prot = iommu_prot;
218 	area->page_offset = start_byte % PAGE_SIZE;
219 	if (area->page_offset & (iopt->iova_alignment - 1))
220 		return -EINVAL;
221 
222 	area->node.start = iova;
223 	if (check_add_overflow(iova, length - 1, &area->node.last))
224 		return -EOVERFLOW;
225 
226 	area->pages_node.start = start_byte / PAGE_SIZE;
227 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
228 		return -EOVERFLOW;
229 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
230 	if (WARN_ON(area->pages_node.last >= pages->npages))
231 		return -EOVERFLOW;
232 
233 	/*
234 	 * The area is inserted with a NULL pages indicating it is not fully
235 	 * initialized yet.
236 	 */
237 	area->iopt = iopt;
238 	interval_tree_insert(&area->node, &iopt->area_itree);
239 	return 0;
240 }
241 
242 static struct iopt_area *iopt_area_alloc(void)
243 {
244 	struct iopt_area *area;
245 
246 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
247 	if (!area)
248 		return NULL;
249 	RB_CLEAR_NODE(&area->node.rb);
250 	RB_CLEAR_NODE(&area->pages_node.rb);
251 	return area;
252 }
253 
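/*
 * Allocate an area for every entry in pages_list and insert them back to back
 * starting at *dst_iova. With IOPT_ALLOC_IOVA a suitable IOVA is chosen here,
 * otherwise the caller-provided *dst_iova is validated. The areas are inserted
 * with NULL pages; the caller completes them under the domains_rwsem.
 */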
254 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
255 				 struct list_head *pages_list,
256 				 unsigned long length, unsigned long *dst_iova,
257 				 int iommu_prot, unsigned int flags)
258 {
259 	struct iopt_pages_list *elm;
260 	unsigned long iova;
261 	int rc = 0;
262 
263 	list_for_each_entry(elm, pages_list, next) {
264 		elm->area = iopt_area_alloc();
265 		if (!elm->area)
266 			return -ENOMEM;
267 	}
268 
269 	down_write(&iopt->iova_rwsem);
270 	if ((length & (iopt->iova_alignment - 1)) || !length) {
271 		rc = -EINVAL;
272 		goto out_unlock;
273 	}
274 
275 	if (flags & IOPT_ALLOC_IOVA) {
276 		/* Use the first entry to guess the ideal IOVA alignment */
277 		elm = list_first_entry(pages_list, struct iopt_pages_list,
278 				       next);
279 		rc = iopt_alloc_iova(
280 			iopt, dst_iova,
281 			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
282 		if (rc)
283 			goto out_unlock;
284 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
285 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
286 			rc = -EINVAL;
287 			goto out_unlock;
288 		}
289 	} else {
290 		rc = iopt_check_iova(iopt, *dst_iova, length);
291 		if (rc)
292 			goto out_unlock;
293 	}
294 
295 	/*
296 	 * Areas are created with a NULL pages so that the IOVA space is
297 	 * reserved and we can unlock the iova_rwsem.
298 	 */
299 	iova = *dst_iova;
300 	list_for_each_entry(elm, pages_list, next) {
301 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
302 				      elm->start_byte, elm->length, iommu_prot);
303 		if (rc)
304 			goto out_unlock;
305 		iova += elm->length;
306 	}
307 
308 out_unlock:
309 	up_write(&iopt->iova_rwsem);
310 	return rc;
311 }
312 
313 static void iopt_abort_area(struct iopt_area *area)
314 {
315 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
316 		WARN_ON(area->pages);
317 	if (area->iopt) {
318 		down_write(&area->iopt->iova_rwsem);
319 		interval_tree_remove(&area->node, &area->iopt->area_itree);
320 		up_write(&area->iopt->iova_rwsem);
321 	}
322 	kfree(area);
323 }
324 
325 void iopt_free_pages_list(struct list_head *pages_list)
326 {
327 	struct iopt_pages_list *elm;
328 
329 	while ((elm = list_first_entry_or_null(pages_list,
330 					       struct iopt_pages_list, next))) {
331 		if (elm->area)
332 			iopt_abort_area(elm->area);
333 		if (elm->pages)
334 			iopt_put_pages(elm->pages);
335 		list_del(&elm->next);
336 		kfree(elm);
337 	}
338 }
339 
340 static int iopt_fill_domains_pages(struct list_head *pages_list)
341 {
342 	struct iopt_pages_list *undo_elm;
343 	struct iopt_pages_list *elm;
344 	int rc;
345 
346 	list_for_each_entry(elm, pages_list, next) {
347 		rc = iopt_area_fill_domains(elm->area, elm->pages);
348 		if (rc)
349 			goto err_undo;
350 	}
351 	return 0;
352 
353 err_undo:
354 	list_for_each_entry(undo_elm, pages_list, next) {
355 		if (undo_elm == elm)
356 			break;
357 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
358 	}
359 	return rc;
360 }
361 
362 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
363 		   unsigned long length, unsigned long *dst_iova,
364 		   int iommu_prot, unsigned int flags)
365 {
366 	struct iopt_pages_list *elm;
367 	int rc;
368 
369 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
370 				   iommu_prot, flags);
371 	if (rc)
372 		return rc;
373 
374 	down_read(&iopt->domains_rwsem);
375 	rc = iopt_fill_domains_pages(pages_list);
376 	if (rc)
377 		goto out_unlock_domains;
378 
379 	down_write(&iopt->iova_rwsem);
380 	list_for_each_entry(elm, pages_list, next) {
381 		/*
382 		 * area->pages must be set inside the domains_rwsem to ensure
383 		 * any newly added domains will get filled. Moves the reference
384 		 * in from the list.
385 		 */
386 		elm->area->pages = elm->pages;
387 		elm->pages = NULL;
388 		elm->area = NULL;
389 	}
390 	up_write(&iopt->iova_rwsem);
391 out_unlock_domains:
392 	up_read(&iopt->domains_rwsem);
393 	return rc;
394 }
395 
396 /**
397  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
398  * @ictx: iommufd_ctx the iopt is part of
399  * @iopt: io_pagetable to act on
400  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
401  *        the chosen iova on output. Otherwise is the iova to map to on input
402  * @uptr: User VA to map
403  * @length: Number of bytes to map
404  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
405  * @flags: IOPT_ALLOC_IOVA or zero
406  *
407  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
408  * page tables this will pin the pages and load them into the domain at iova.
409  * For non-domain page tables this will only setup a lazy reference and the
410  * caller must use iopt_access_pages() to touch them.
411  *
412  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
413  * destroyed.
414  */
415 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
416 			unsigned long *iova, void __user *uptr,
417 			unsigned long length, int iommu_prot,
418 			unsigned int flags)
419 {
420 	struct iopt_pages_list elm = {};
421 	LIST_HEAD(pages_list);
422 	int rc;
423 
424 	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
425 	if (IS_ERR(elm.pages))
426 		return PTR_ERR(elm.pages);
427 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
428 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
429 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
430 	elm.start_byte = uptr - elm.pages->uptr;
431 	elm.length = length;
432 	list_add(&elm.next, &pages_list);
433 
434 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
435 	if (rc) {
436 		if (elm.area)
437 			iopt_abort_area(elm.area);
438 		if (elm.pages)
439 			iopt_put_pages(elm.pages);
440 		return rc;
441 	}
442 	return 0;
443 }
444 
445 struct iova_bitmap_fn_arg {
446 	unsigned long flags;
447 	struct io_pagetable *iopt;
448 	struct iommu_domain *domain;
449 	struct iommu_dirty_bitmap *dirty;
450 };
451 
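/*
 * Callback for iova_bitmap_for_each(): walk the contiguous areas covering the
 * requested range and have the domain report (and optionally clear) the dirty
 * bits for each slice.
 */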
452 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
453 					unsigned long iova, size_t length,
454 					void *opaque)
455 {
456 	struct iopt_area *area;
457 	struct iopt_area_contig_iter iter;
458 	struct iova_bitmap_fn_arg *arg = opaque;
459 	struct iommu_domain *domain = arg->domain;
460 	struct iommu_dirty_bitmap *dirty = arg->dirty;
461 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
462 	unsigned long last_iova = iova + length - 1;
463 	unsigned long flags = arg->flags;
464 	int ret;
465 
466 	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
467 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
468 
469 		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
470 						last - iter.cur_iova + 1, flags,
471 						dirty);
472 		if (ret)
473 			return ret;
474 	}
475 
476 	if (!iopt_area_contig_done(&iter))
477 		return -EINVAL;
478 	return 0;
479 }
480 
481 static int
482 iommu_read_and_clear_dirty(struct iommu_domain *domain,
483 			   struct io_pagetable *iopt, unsigned long flags,
484 			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
485 {
486 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
487 	struct iommu_iotlb_gather gather;
488 	struct iommu_dirty_bitmap dirty;
489 	struct iova_bitmap_fn_arg arg;
490 	struct iova_bitmap *iter;
491 	int ret = 0;
492 
493 	if (!ops || !ops->read_and_clear_dirty)
494 		return -EOPNOTSUPP;
495 
496 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
497 				 bitmap->page_size,
498 				 u64_to_user_ptr(bitmap->data));
499 	if (IS_ERR(iter))
500 		return -ENOMEM;
501 
502 	iommu_dirty_bitmap_init(&dirty, iter, &gather);
503 
504 	arg.flags = flags;
505 	arg.iopt = iopt;
506 	arg.domain = domain;
507 	arg.dirty = &dirty;
508 	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
509 
510 	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
511 		iommu_iotlb_sync(domain, &gather);
512 
513 	iova_bitmap_free(iter);
514 
515 	return ret;
516 }
517 
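/*
 * Validate the user-supplied dirty bitmap range: it must not overflow, must
 * fit in an unsigned long, and must be aligned to both the IOMMU page size
 * and the bitmap page size.
 */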
518 int iommufd_check_iova_range(struct io_pagetable *iopt,
519 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
520 {
521 	size_t iommu_pgsize = iopt->iova_alignment;
522 	u64 last_iova;
523 
524 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
525 		return -EOVERFLOW;
526 
527 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
528 		return -EOVERFLOW;
529 
530 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
531 	    ((last_iova + 1) & (iommu_pgsize - 1)))
532 		return -EINVAL;
533 
534 	if (!bitmap->page_size)
535 		return -EINVAL;
536 
537 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
538 	    ((last_iova + 1) & (bitmap->page_size - 1)))
539 		return -EINVAL;
540 
541 	return 0;
542 }
543 
544 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
545 				   struct iommu_domain *domain,
546 				   unsigned long flags,
547 				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
548 {
549 	int ret;
550 
551 	ret = iommufd_check_iova_range(iopt, bitmap);
552 	if (ret)
553 		return ret;
554 
555 	down_read(&iopt->iova_rwsem);
556 	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
557 	up_read(&iopt->iova_rwsem);
558 
559 	return ret;
560 }
561 
562 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
563 				 struct iommu_domain *domain)
564 {
565 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
566 	struct iommu_iotlb_gather gather;
567 	struct iommu_dirty_bitmap dirty;
568 	struct iopt_area *area;
569 	int ret = 0;
570 
571 	lockdep_assert_held_read(&iopt->iova_rwsem);
572 
573 	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
574 
575 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
576 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
577 		if (!area->pages)
578 			continue;
579 
580 		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
581 						iopt_area_length(area), 0,
582 						&dirty);
583 		if (ret)
584 			break;
585 	}
586 
587 	iommu_iotlb_sync(domain, &gather);
588 	return ret;
589 }
590 
591 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
592 			    struct iommu_domain *domain, bool enable)
593 {
594 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
595 	int ret = 0;
596 
597 	if (!ops)
598 		return -EOPNOTSUPP;
599 
600 	down_read(&iopt->iova_rwsem);
601 
602 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
603 	if (enable) {
604 		ret = iopt_clear_dirty_data(iopt, domain);
605 		if (ret)
606 			goto out_unlock;
607 	}
608 
609 	ret = ops->set_dirty_tracking(domain, enable);
610 
611 out_unlock:
612 	up_read(&iopt->iova_rwsem);
613 	return ret;
614 }
615 
616 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
617 		   unsigned long length, struct list_head *pages_list)
618 {
619 	struct iopt_area_contig_iter iter;
620 	unsigned long last_iova;
621 	struct iopt_area *area;
622 	int rc;
623 
624 	if (!length)
625 		return -EINVAL;
626 	if (check_add_overflow(iova, length - 1, &last_iova))
627 		return -EOVERFLOW;
628 
629 	down_read(&iopt->iova_rwsem);
630 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
631 		struct iopt_pages_list *elm;
632 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
633 
634 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
635 		if (!elm) {
636 			rc = -ENOMEM;
637 			goto err_free;
638 		}
639 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
640 		elm->pages = area->pages;
641 		elm->length = (last - iter.cur_iova) + 1;
642 		kref_get(&elm->pages->kref);
643 		list_add_tail(&elm->next, pages_list);
644 	}
645 	if (!iopt_area_contig_done(&iter)) {
646 		rc = -ENOENT;
647 		goto err_free;
648 	}
649 	up_read(&iopt->iova_rwsem);
650 	return 0;
651 err_free:
652 	up_read(&iopt->iova_rwsem);
653 	iopt_free_pages_list(pages_list);
654 	return rc;
655 }
656 
657 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
658 				 unsigned long last, unsigned long *unmapped)
659 {
660 	struct iopt_area *area;
661 	unsigned long unmapped_bytes = 0;
662 	unsigned int tries = 0;
663 	int rc = -ENOENT;
664 
665 	/*
666 	 * The domains_rwsem must be held in read mode any time any area->pages
667 	 * is NULL. This prevents domain attach/detach from running
668 	 * concurrently with cleaning up the area.
669 	 */
670 again:
671 	down_read(&iopt->domains_rwsem);
672 	down_write(&iopt->iova_rwsem);
673 	while ((area = iopt_area_iter_first(iopt, start, last))) {
674 		unsigned long area_last = iopt_area_last_iova(area);
675 		unsigned long area_first = iopt_area_iova(area);
676 		struct iopt_pages *pages;
677 
678 		/* Userspace should not race map/unmap of the same area */
679 		if (!area->pages) {
680 			rc = -EBUSY;
681 			goto out_unlock_iova;
682 		}
683 
684 		if (area_first < start || area_last > last) {
685 			rc = -ENOENT;
686 			goto out_unlock_iova;
687 		}
688 
689 		if (area_first != start)
690 			tries = 0;
691 
692 		/*
693 		 * num_accesses writers must hold the iova_rwsem too, so we can
694 		 * safely read it under the write side of the iova_rwsem
695 		 * without the pages->mutex.
696 		 */
697 		if (area->num_accesses) {
698 			size_t length = iopt_area_length(area);
699 
700 			start = area_first;
701 			area->prevent_access = true;
702 			up_write(&iopt->iova_rwsem);
703 			up_read(&iopt->domains_rwsem);
704 
705 			iommufd_access_notify_unmap(iopt, area_first, length);
706 			/* Something is not responding to unmap requests. */
707 			tries++;
708 			if (WARN_ON(tries > 100)) {
709 				rc = -EDEADLOCK;
710 				goto out_unmapped;
711 			}
712 			goto again;
713 		}
714 
715 		pages = area->pages;
716 		area->pages = NULL;
717 		up_write(&iopt->iova_rwsem);
718 
719 		iopt_area_unfill_domains(area, pages);
720 		iopt_abort_area(area);
721 		iopt_put_pages(pages);
722 
723 		unmapped_bytes += area_last - area_first + 1;
724 
725 		down_write(&iopt->iova_rwsem);
726 	}
727 	if (unmapped_bytes)
728 		rc = 0;
729 
730 out_unlock_iova:
731 	up_write(&iopt->iova_rwsem);
732 	up_read(&iopt->domains_rwsem);
733 out_unmapped:
734 	if (unmapped)
735 		*unmapped = unmapped_bytes;
736 	return rc;
737 }
738 
739 /**
740  * iopt_unmap_iova() - Remove a range of iova
741  * @iopt: io_pagetable to act on
742  * @iova: Starting iova to unmap
743  * @length: Number of bytes to unmap
744  * @unmapped: Return number of bytes unmapped
745  *
746  * The requested range must be a superset of existing ranges.
747  * Splitting/truncating IOVA mappings is not allowed.
748  */
749 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
750 		    unsigned long length, unsigned long *unmapped)
751 {
752 	unsigned long iova_last;
753 
754 	if (!length)
755 		return -EINVAL;
756 
757 	if (check_add_overflow(iova, length - 1, &iova_last))
758 		return -EOVERFLOW;
759 
760 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
761 }
762 
763 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
764 {
765 	int rc;
766 
767 	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
768 	/* If the IOVAs are empty then unmap all succeeds */
769 	if (rc == -ENOENT)
770 		return 0;
771 	return rc;
772 }
773 
774 /* The caller must always free all the nodes in the allowed_iova rb_root. */
775 int iopt_set_allow_iova(struct io_pagetable *iopt,
776 			struct rb_root_cached *allowed_iova)
777 {
778 	struct iopt_allowed *allowed;
779 
780 	down_write(&iopt->iova_rwsem);
781 	swap(*allowed_iova, iopt->allowed_itree);
782 
783 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
784 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
785 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
786 					     allowed->node.last)) {
787 			swap(*allowed_iova, iopt->allowed_itree);
788 			up_write(&iopt->iova_rwsem);
789 			return -EADDRINUSE;
790 		}
791 	}
792 	up_write(&iopt->iova_rwsem);
793 	return 0;
794 }
795 
796 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
797 		      unsigned long last, void *owner)
798 {
799 	struct iopt_reserved *reserved;
800 
801 	lockdep_assert_held_write(&iopt->iova_rwsem);
802 
803 	if (iopt_area_iter_first(iopt, start, last) ||
804 	    iopt_allowed_iter_first(iopt, start, last))
805 		return -EADDRINUSE;
806 
807 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
808 	if (!reserved)
809 		return -ENOMEM;
810 	reserved->node.start = start;
811 	reserved->node.last = last;
812 	reserved->owner = owner;
813 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
814 	return 0;
815 }
816 
817 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
818 {
819 	struct iopt_reserved *reserved, *next;
820 
821 	lockdep_assert_held_write(&iopt->iova_rwsem);
822 
823 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
824 	     reserved = next) {
825 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
826 
827 		if (reserved->owner == owner) {
828 			interval_tree_remove(&reserved->node,
829 					     &iopt->reserved_itree);
830 			kfree(reserved);
831 		}
832 	}
833 }
834 
835 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
836 {
837 	down_write(&iopt->iova_rwsem);
838 	__iopt_remove_reserved_iova(iopt, owner);
839 	up_write(&iopt->iova_rwsem);
840 }
841 
842 void iopt_init_table(struct io_pagetable *iopt)
843 {
844 	init_rwsem(&iopt->iova_rwsem);
845 	init_rwsem(&iopt->domains_rwsem);
846 	iopt->area_itree = RB_ROOT_CACHED;
847 	iopt->allowed_itree = RB_ROOT_CACHED;
848 	iopt->reserved_itree = RB_ROOT_CACHED;
849 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
850 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
851 
852 	/*
853 	 * iopts start as SW tables that can use the entire size_t IOVA space
854 	 * due to the use of size_t in the APIs. They have no alignment
855 	 * restriction.
856 	 */
857 	iopt->iova_alignment = 1;
858 }
859 
860 void iopt_destroy_table(struct io_pagetable *iopt)
861 {
862 	struct interval_tree_node *node;
863 
864 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
865 		iopt_remove_reserved_iova(iopt, NULL);
866 
867 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
868 						ULONG_MAX))) {
869 		interval_tree_remove(node, &iopt->allowed_itree);
870 		kfree(container_of(node, struct iopt_allowed, node));
871 	}
872 
873 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
874 	WARN_ON(!xa_empty(&iopt->domains));
875 	WARN_ON(!xa_empty(&iopt->access_list));
876 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
877 }
878 
879 /**
880  * iopt_unfill_domain() - Unfill a domain with PFNs
881  * @iopt: io_pagetable to act on
882  * @domain: domain to unfill
883  *
884  * This is used when removing a domain from the iopt. Every area in the iopt
885  * will be unmapped from the domain. The domain must already be removed from the
886  * domains xarray.
887  */
888 static void iopt_unfill_domain(struct io_pagetable *iopt,
889 			       struct iommu_domain *domain)
890 {
891 	struct iopt_area *area;
892 
893 	lockdep_assert_held(&iopt->iova_rwsem);
894 	lockdep_assert_held_write(&iopt->domains_rwsem);
895 
896 	/*
897 	 * Some other domain is still holding all the pfns, so rapidly unmap this
898 	 * domain.
899 	 */
900 	if (iopt->next_domain_id != 0) {
901 		/* Pick an arbitrary remaining domain to act as storage */
902 		struct iommu_domain *storage_domain =
903 			xa_load(&iopt->domains, 0);
904 
905 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
906 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
907 			struct iopt_pages *pages = area->pages;
908 
909 			if (!pages)
910 				continue;
911 
912 			mutex_lock(&pages->mutex);
913 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
914 				WARN_ON(!area->storage_domain);
915 			if (area->storage_domain == domain)
916 				area->storage_domain = storage_domain;
917 			mutex_unlock(&pages->mutex);
918 
919 			iopt_area_unmap_domain(area, domain);
920 		}
921 		return;
922 	}
923 
924 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
925 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
926 		struct iopt_pages *pages = area->pages;
927 
928 		if (!pages)
929 			continue;
930 
931 		mutex_lock(&pages->mutex);
932 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
933 		WARN_ON(area->storage_domain != domain);
934 		area->storage_domain = NULL;
935 		iopt_area_unfill_domain(area, pages, domain);
936 		mutex_unlock(&pages->mutex);
937 	}
938 }
939 
940 /**
941  * iopt_fill_domain() - Fill a domain with PFNs
942  * @iopt: io_pagetable to act on
943  * @domain: domain to fill
944  *
945  * Fill the domain with PFNs from every area in the iopt. On failure the domain
946  * is left unchanged.
947  */
948 static int iopt_fill_domain(struct io_pagetable *iopt,
949 			    struct iommu_domain *domain)
950 {
951 	struct iopt_area *end_area;
952 	struct iopt_area *area;
953 	int rc;
954 
955 	lockdep_assert_held(&iopt->iova_rwsem);
956 	lockdep_assert_held_write(&iopt->domains_rwsem);
957 
958 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
959 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
960 		struct iopt_pages *pages = area->pages;
961 
962 		if (!pages)
963 			continue;
964 
965 		mutex_lock(&pages->mutex);
966 		rc = iopt_area_fill_domain(area, domain);
967 		if (rc) {
968 			mutex_unlock(&pages->mutex);
969 			goto out_unfill;
970 		}
971 		if (!area->storage_domain) {
972 			WARN_ON(iopt->next_domain_id != 0);
973 			area->storage_domain = domain;
974 			interval_tree_insert(&area->pages_node,
975 					     &pages->domains_itree);
976 		}
977 		mutex_unlock(&pages->mutex);
978 	}
979 	return 0;
980 
981 out_unfill:
982 	end_area = area;
983 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
984 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
985 		struct iopt_pages *pages = area->pages;
986 
987 		if (area == end_area)
988 			break;
989 		if (!pages)
990 			continue;
991 		mutex_lock(&pages->mutex);
992 		if (iopt->next_domain_id == 0) {
993 			interval_tree_remove(&area->pages_node,
994 					     &pages->domains_itree);
995 			area->storage_domain = NULL;
996 		}
997 		iopt_area_unfill_domain(area, pages, domain);
998 		mutex_unlock(&pages->mutex);
999 	}
1000 	return rc;
1001 }
1002 
1003 /* Check that all existing areas conform to an increased page size */
1004 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
1005 				     unsigned long new_iova_alignment)
1006 {
1007 	unsigned long align_mask = new_iova_alignment - 1;
1008 	struct iopt_area *area;
1009 
1010 	lockdep_assert_held(&iopt->iova_rwsem);
1011 	lockdep_assert_held(&iopt->domains_rwsem);
1012 
1013 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1014 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
1015 		if ((iopt_area_iova(area) & align_mask) ||
1016 		    (iopt_area_length(area) & align_mask) ||
1017 		    (area->page_offset & align_mask))
1018 			return -EADDRINUSE;
1019 
1020 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1021 		struct iommufd_access *access;
1022 		unsigned long index;
1023 
1024 		xa_for_each(&iopt->access_list, index, access)
1025 			if (WARN_ON(access->iova_alignment >
1026 				    new_iova_alignment))
1027 				return -EADDRINUSE;
1028 	}
1029 	return 0;
1030 }
1031 
1032 int iopt_table_add_domain(struct io_pagetable *iopt,
1033 			  struct iommu_domain *domain)
1034 {
1035 	const struct iommu_domain_geometry *geometry = &domain->geometry;
1036 	struct iommu_domain *iter_domain;
1037 	unsigned int new_iova_alignment;
1038 	unsigned long index;
1039 	int rc;
1040 
1041 	down_write(&iopt->domains_rwsem);
1042 	down_write(&iopt->iova_rwsem);
1043 
1044 	xa_for_each(&iopt->domains, index, iter_domain) {
1045 		if (WARN_ON(iter_domain == domain)) {
1046 			rc = -EEXIST;
1047 			goto out_unlock;
1048 		}
1049 	}
1050 
1051 	/*
1052 	 * The io page size drives the iova_alignment. Internally the iopt_pages
1053 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1054 	 * objects into the iommu_domain.
1055 	 *
1056 	 * An iommu_domain must always be able to accept PAGE_SIZE to be
1057 	 * compatible as we can't guarantee higher contiguity.
1058 	 */
1059 	new_iova_alignment = max_t(unsigned long,
1060 				   1UL << __ffs(domain->pgsize_bitmap),
1061 				   iopt->iova_alignment);
1062 	if (new_iova_alignment > PAGE_SIZE) {
1063 		rc = -EINVAL;
1064 		goto out_unlock;
1065 	}
1066 	if (new_iova_alignment != iopt->iova_alignment) {
1067 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1068 		if (rc)
1069 			goto out_unlock;
1070 	}
1071 
1072 	/* No area exists that is outside the allowed domain aperture */
1073 	if (geometry->aperture_start != 0) {
1074 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1075 				       domain);
1076 		if (rc)
1077 			goto out_reserved;
1078 	}
1079 	if (geometry->aperture_end != ULONG_MAX) {
1080 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1081 				       ULONG_MAX, domain);
1082 		if (rc)
1083 			goto out_reserved;
1084 	}
1085 
1086 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1087 	if (rc)
1088 		goto out_reserved;
1089 
1090 	rc = iopt_fill_domain(iopt, domain);
1091 	if (rc)
1092 		goto out_release;
1093 
1094 	iopt->iova_alignment = new_iova_alignment;
1095 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1096 	iopt->next_domain_id++;
1097 	up_write(&iopt->iova_rwsem);
1098 	up_write(&iopt->domains_rwsem);
1099 	return 0;
1100 out_release:
1101 	xa_release(&iopt->domains, iopt->next_domain_id);
1102 out_reserved:
1103 	__iopt_remove_reserved_iova(iopt, domain);
1104 out_unlock:
1105 	up_write(&iopt->iova_rwsem);
1106 	up_write(&iopt->domains_rwsem);
1107 	return rc;
1108 }
1109 
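/*
 * Recompute iova_alignment as the strictest alignment required by any
 * attached domain or registered access (or PAGE_SIZE when large pages are
 * disabled). Raising the alignment is only permitted when every existing
 * area still conforms.
 */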
1110 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1111 {
1112 	unsigned long new_iova_alignment;
1113 	struct iommufd_access *access;
1114 	struct iommu_domain *domain;
1115 	unsigned long index;
1116 
1117 	lockdep_assert_held_write(&iopt->iova_rwsem);
1118 	lockdep_assert_held(&iopt->domains_rwsem);
1119 
1120 	/* See batch_iommu_map_small() */
1121 	if (iopt->disable_large_pages)
1122 		new_iova_alignment = PAGE_SIZE;
1123 	else
1124 		new_iova_alignment = 1;
1125 
1126 	xa_for_each(&iopt->domains, index, domain)
1127 		new_iova_alignment = max_t(unsigned long,
1128 					   1UL << __ffs(domain->pgsize_bitmap),
1129 					   new_iova_alignment);
1130 	xa_for_each(&iopt->access_list, index, access)
1131 		new_iova_alignment = max_t(unsigned long,
1132 					   access->iova_alignment,
1133 					   new_iova_alignment);
1134 
1135 	if (new_iova_alignment > iopt->iova_alignment) {
1136 		int rc;
1137 
1138 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1139 		if (rc)
1140 			return rc;
1141 	}
1142 	iopt->iova_alignment = new_iova_alignment;
1143 	return 0;
1144 }
1145 
1146 void iopt_table_remove_domain(struct io_pagetable *iopt,
1147 			      struct iommu_domain *domain)
1148 {
1149 	struct iommu_domain *iter_domain = NULL;
1150 	unsigned long index;
1151 
1152 	down_write(&iopt->domains_rwsem);
1153 	down_write(&iopt->iova_rwsem);
1154 
1155 	xa_for_each(&iopt->domains, index, iter_domain)
1156 		if (iter_domain == domain)
1157 			break;
1158 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1159 		goto out_unlock;
1160 
1161 	/*
1162 	 * Compress the xarray to keep it linear by swapping the entry to erase
1163 	 * with the tail entry and shrinking the tail.
1164 	 */
1165 	iopt->next_domain_id--;
1166 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1167 	if (index != iopt->next_domain_id)
1168 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1169 
1170 	iopt_unfill_domain(iopt, domain);
1171 	__iopt_remove_reserved_iova(iopt, domain);
1172 
1173 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1174 out_unlock:
1175 	up_write(&iopt->iova_rwsem);
1176 	up_write(&iopt->domains_rwsem);
1177 }
1178 
1179 /**
1180  * iopt_area_split - Split an area into two parts at iova
1181  * @area: The area to split
1182  * @iova: Becomes the last of a new area
1183  *
1184  * This splits an area into two. It is part of the VFIO compatibility to allow
1185  * poking a hole in the mapping. The two areas continue to point at the same
1186  * iopt_pages, just with different starting bytes.
1187  */
1188 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1189 {
1190 	unsigned long alignment = area->iopt->iova_alignment;
1191 	unsigned long last_iova = iopt_area_last_iova(area);
1192 	unsigned long start_iova = iopt_area_iova(area);
1193 	unsigned long new_start = iova + 1;
1194 	struct io_pagetable *iopt = area->iopt;
1195 	struct iopt_pages *pages = area->pages;
1196 	struct iopt_area *lhs;
1197 	struct iopt_area *rhs;
1198 	int rc;
1199 
1200 	lockdep_assert_held_write(&iopt->iova_rwsem);
1201 
1202 	if (iova == start_iova || iova == last_iova)
1203 		return 0;
1204 
1205 	if (!pages || area->prevent_access)
1206 		return -EBUSY;
1207 
1208 	if (new_start & (alignment - 1) ||
1209 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1210 		return -EINVAL;
1211 
1212 	lhs = iopt_area_alloc();
1213 	if (!lhs)
1214 		return -ENOMEM;
1215 
1216 	rhs = iopt_area_alloc();
1217 	if (!rhs) {
1218 		rc = -ENOMEM;
1219 		goto err_free_lhs;
1220 	}
1221 
1222 	mutex_lock(&pages->mutex);
1223 	/*
1224 	 * Splitting is not permitted if an access exists; we don't track enough
1225 	 * information to split existing accesses.
1226 	 */
1227 	if (area->num_accesses) {
1228 		rc = -EINVAL;
1229 		goto err_unlock;
1230 	}
1231 
1232 	/*
1233 	 * Splitting is not permitted if a domain could have been mapped with
1234 	 * huge pages.
1235 	 */
1236 	if (area->storage_domain && !iopt->disable_large_pages) {
1237 		rc = -EINVAL;
1238 		goto err_unlock;
1239 	}
1240 
1241 	interval_tree_remove(&area->node, &iopt->area_itree);
1242 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1243 			      iopt_area_start_byte(area, start_iova),
1244 			      (new_start - 1) - start_iova + 1,
1245 			      area->iommu_prot);
1246 	if (WARN_ON(rc))
1247 		goto err_insert;
1248 
1249 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1250 			      iopt_area_start_byte(area, new_start),
1251 			      last_iova - new_start + 1, area->iommu_prot);
1252 	if (WARN_ON(rc))
1253 		goto err_remove_lhs;
1254 
1255 	/*
1256 	 * If the original area has filled a domain, domains_itree has to be
1257 	 * updated.
1258 	 */
1259 	if (area->storage_domain) {
1260 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1261 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1262 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1263 	}
1264 
1265 	lhs->storage_domain = area->storage_domain;
1266 	lhs->pages = area->pages;
1267 	rhs->storage_domain = area->storage_domain;
1268 	rhs->pages = area->pages;
1269 	kref_get(&rhs->pages->kref);
1270 	kfree(area);
1271 	mutex_unlock(&pages->mutex);
1272 
1273 	/*
1274 	 * No change to domains or accesses because the underlying pages have
1275 	 * not changed.
1276 	 */
1277 	return 0;
1278 
1279 err_remove_lhs:
1280 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1281 err_insert:
1282 	interval_tree_insert(&area->node, &iopt->area_itree);
1283 err_unlock:
1284 	mutex_unlock(&pages->mutex);
1285 	kfree(rhs);
1286 err_free_lhs:
1287 	kfree(lhs);
1288 	return rc;
1289 }
1290 
1291 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1292 		  size_t num_iovas)
1293 {
1294 	int rc = 0;
1295 	int i;
1296 
1297 	down_write(&iopt->iova_rwsem);
1298 	for (i = 0; i < num_iovas; i++) {
1299 		struct iopt_area *area;
1300 
1301 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1302 		if (!area)
1303 			continue;
1304 		rc = iopt_area_split(area, iovas[i]);
1305 		if (rc)
1306 			break;
1307 	}
1308 	up_write(&iopt->iova_rwsem);
1309 	return rc;
1310 }
1311 
1312 void iopt_enable_large_pages(struct io_pagetable *iopt)
1313 {
1314 	int rc;
1315 
1316 	down_write(&iopt->domains_rwsem);
1317 	down_write(&iopt->iova_rwsem);
1318 	WRITE_ONCE(iopt->disable_large_pages, false);
1319 	rc = iopt_calculate_iova_alignment(iopt);
1320 	WARN_ON(rc);
1321 	up_write(&iopt->iova_rwsem);
1322 	up_write(&iopt->domains_rwsem);
1323 }
1324 
1325 int iopt_disable_large_pages(struct io_pagetable *iopt)
1326 {
1327 	int rc = 0;
1328 
1329 	down_write(&iopt->domains_rwsem);
1330 	down_write(&iopt->iova_rwsem);
1331 	if (iopt->disable_large_pages)
1332 		goto out_unlock;
1333 
1334 	/* Won't do it if domains already have pages mapped in them */
1335 	if (!xa_empty(&iopt->domains) &&
1336 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1337 		rc = -EINVAL;
1338 		goto out_unlock;
1339 	}
1340 
1341 	WRITE_ONCE(iopt->disable_large_pages, true);
1342 	rc = iopt_calculate_iova_alignment(iopt);
1343 	if (rc)
1344 		WRITE_ONCE(iopt->disable_large_pages, false);
1345 out_unlock:
1346 	up_write(&iopt->iova_rwsem);
1347 	up_write(&iopt->domains_rwsem);
1348 	return rc;
1349 }
1350 
1351 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1352 {
1353 	u32 new_id;
1354 	int rc;
1355 
1356 	down_write(&iopt->domains_rwsem);
1357 	down_write(&iopt->iova_rwsem);
1358 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1359 		      GFP_KERNEL_ACCOUNT);
1360 
1361 	if (rc)
1362 		goto out_unlock;
1363 
1364 	rc = iopt_calculate_iova_alignment(iopt);
1365 	if (rc) {
1366 		xa_erase(&iopt->access_list, new_id);
1367 		goto out_unlock;
1368 	}
1369 	access->iopt_access_list_id = new_id;
1370 
1371 out_unlock:
1372 	up_write(&iopt->iova_rwsem);
1373 	up_write(&iopt->domains_rwsem);
1374 	return rc;
1375 }
1376 
1377 void iopt_remove_access(struct io_pagetable *iopt,
1378 			struct iommufd_access *access,
1379 			u32 iopt_access_list_id)
1380 {
1381 	down_write(&iopt->domains_rwsem);
1382 	down_write(&iopt->iova_rwsem);
1383 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1384 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1385 	up_write(&iopt->iova_rwsem);
1386 	up_write(&iopt->domains_rwsem);
1387 }
1388 
1389 /* Narrow the usable IOVA space by reserving a device's reserved regions. */
1390 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1391 					struct device *dev,
1392 					phys_addr_t *sw_msi_start)
1393 {
1394 	struct iommu_resv_region *resv;
1395 	LIST_HEAD(resv_regions);
1396 	unsigned int num_hw_msi = 0;
1397 	unsigned int num_sw_msi = 0;
1398 	int rc;
1399 
1400 	if (iommufd_should_fail())
1401 		return -EINVAL;
1402 
1403 	down_write(&iopt->iova_rwsem);
1404 	/* FIXME: drivers allocate memory but there is no failure propagated */
1405 	iommu_get_resv_regions(dev, &resv_regions);
1406 
1407 	list_for_each_entry(resv, &resv_regions, list) {
1408 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1409 			continue;
1410 
1411 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1412 			num_hw_msi++;
1413 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1414 			*sw_msi_start = resv->start;
1415 			num_sw_msi++;
1416 		}
1417 
1418 		rc = iopt_reserve_iova(iopt, resv->start,
1419 				       resv->length - 1 + resv->start, dev);
1420 		if (rc)
1421 			goto out_reserved;
1422 	}
1423 
1424 	/* Drivers must offer sane combinations of regions */
1425 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1426 		rc = -EINVAL;
1427 		goto out_reserved;
1428 	}
1429 
1430 	rc = 0;
1431 	goto out_free_resv;
1432 
1433 out_reserved:
1434 	__iopt_remove_reserved_iova(iopt, dev);
1435 out_free_resv:
1436 	iommu_put_resv_regions(dev, &resv_regions);
1437 	up_write(&iopt->iova_rwsem);
1438 	return rc;
1439 }
1440