1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
3 */
4 #include <linux/file.h>
5 #include <linux/interval_tree.h>
6 #include <linux/iommu.h>
7 #include <linux/iommufd.h>
8 #include <linux/slab.h>
9 #include <linux/vfio.h>
10 #include <uapi/linux/vfio.h>
11 #include <uapi/linux/iommufd.h>
12
13 #include "iommufd_private.h"
14
get_compat_ioas(struct iommufd_ctx * ictx)15 static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
16 {
17 struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);
18
19 xa_lock(&ictx->objects);
20 if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
21 goto out_unlock;
22 ioas = ictx->vfio_ioas;
23 out_unlock:
24 xa_unlock(&ictx->objects);
25 return ioas;
26 }
27
28 /**
29 * iommufd_vfio_compat_ioas_get_id - Ensure a compat IOAS exists
30 * @ictx: Context to operate on
31 * @out_ioas_id: The IOAS ID of the compatibility IOAS
32 *
33 * Return the ID of the current compatibility IOAS. The ID can be passed into
34 * other functions that take an ioas_id.
35 */
iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx * ictx,u32 * out_ioas_id)36 int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
37 {
38 struct iommufd_ioas *ioas;
39
40 ioas = get_compat_ioas(ictx);
41 if (IS_ERR(ioas))
42 return PTR_ERR(ioas);
43 *out_ioas_id = ioas->obj.id;
44 iommufd_put_object(&ioas->obj);
45 return 0;
46 }
47 EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);
48
49 /**
50 * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
51 * @ictx: Context to operate on
52 *
53 * This allows selecting the VFIO_NOIOMMU_IOMMU and blocks normal types.
54 */
iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx * ictx)55 int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
56 {
57 int ret;
58
59 xa_lock(&ictx->objects);
60 if (!ictx->vfio_ioas) {
61 ictx->no_iommu_mode = 1;
62 ret = 0;
63 } else {
64 ret = -EINVAL;
65 }
66 xa_unlock(&ictx->objects);
67 return ret;
68 }
69 EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);
70
71 /**
72 * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
73 * @ictx: Context to operate on
74 *
75 * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
76 * on since they do not have an IOAS ID input in their ABI. Only attaching a
77 * group should cause a default creation of the internal ioas, this does nothing
78 * if an existing ioas has already been assigned somehow.
79 */
iommufd_vfio_compat_ioas_create(struct iommufd_ctx * ictx)80 int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
81 {
82 struct iommufd_ioas *ioas = NULL;
83 int ret;
84
85 ioas = iommufd_ioas_alloc(ictx);
86 if (IS_ERR(ioas))
87 return PTR_ERR(ioas);
88
89 xa_lock(&ictx->objects);
90 /*
91 * VFIO won't allow attaching a container to both iommu and no iommu
92 * operation
93 */
94 if (ictx->no_iommu_mode) {
95 ret = -EINVAL;
96 goto out_abort;
97 }
98
99 if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
100 ret = 0;
101 iommufd_put_object(&ictx->vfio_ioas->obj);
102 goto out_abort;
103 }
104 ictx->vfio_ioas = ioas;
105 xa_unlock(&ictx->objects);
106
107 /*
108 * An automatically created compat IOAS is treated as a userspace
109 * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
110 * and if not manually destroyed it will be destroyed automatically
111 * at iommufd release.
112 */
113 iommufd_object_finalize(ictx, &ioas->obj);
114 return 0;
115
116 out_abort:
117 xa_unlock(&ictx->objects);
118 iommufd_object_abort(ictx, &ioas->obj);
119 return ret;
120 }
121 EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);
122
iommufd_vfio_ioas(struct iommufd_ucmd * ucmd)123 int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
124 {
125 struct iommu_vfio_ioas *cmd = ucmd->cmd;
126 struct iommufd_ioas *ioas;
127
128 if (cmd->__reserved)
129 return -EOPNOTSUPP;
130 switch (cmd->op) {
131 case IOMMU_VFIO_IOAS_GET:
132 ioas = get_compat_ioas(ucmd->ictx);
133 if (IS_ERR(ioas))
134 return PTR_ERR(ioas);
135 cmd->ioas_id = ioas->obj.id;
136 iommufd_put_object(&ioas->obj);
137 return iommufd_ucmd_respond(ucmd, sizeof(*cmd));
138
139 case IOMMU_VFIO_IOAS_SET:
140 ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
141 if (IS_ERR(ioas))
142 return PTR_ERR(ioas);
143 xa_lock(&ucmd->ictx->objects);
144 ucmd->ictx->vfio_ioas = ioas;
145 xa_unlock(&ucmd->ictx->objects);
146 iommufd_put_object(&ioas->obj);
147 return 0;
148
149 case IOMMU_VFIO_IOAS_CLEAR:
150 xa_lock(&ucmd->ictx->objects);
151 ucmd->ictx->vfio_ioas = NULL;
152 xa_unlock(&ucmd->ictx->objects);
153 return 0;
154 default:
155 return -EOPNOTSUPP;
156 }
157 }
158
/*
 * Emulate VFIO_IOMMU_MAP_DMA on top of the compat IOAS.
 *
 * @arg points to a userspace struct vfio_iommu_type1_dma_map. Only the READ
 * and WRITE flags are accepted. @cmd is unused.
 *
 * Return: the result of iopt_map_user_pages(), -EFAULT if the argument cannot
 * be copied in, -EINVAL for a short argsz or unsupported flags, or the errno
 * from get_compat_ioas().
 */
static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				void __user *arg)
{
	const u32 supported_flags =
		VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	const size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
	struct vfio_iommu_type1_dma_map map;
	struct iommufd_ioas *ioas;
	unsigned long iova;
	int iommu_prot;
	int rc;

	if (copy_from_user(&map, arg, minsz))
		return -EFAULT;

	if (map.argsz < minsz)
		return -EINVAL;
	if (map.flags & ~supported_flags)
		return -EINVAL;

	/* Translate the VFIO permission flags into IOMMU protection bits */
	iommu_prot = IOMMU_CACHE;
	if (map.flags & VFIO_DMA_MAP_FLAG_READ)
		iommu_prot |= IOMMU_READ;
	if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
		iommu_prot |= IOMMU_WRITE;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	/*
	 * Maps created through the legacy interface always use VFIO compatible
	 * rlimit accounting. If the user wishes to use the faster user based
	 * rlimit accounting then they must use the new interface.
	 */
	iova = map.iova;
	rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova,
				 u64_to_user_ptr(map.vaddr), map.size,
				 iommu_prot, 0);

	iommufd_put_object(&ioas->obj);
	return rc;
}
196
/*
 * Emulate VFIO_IOMMU_UNMAP_DMA on top of the compat IOAS.
 *
 * @arg points to a userspace struct vfio_iommu_type1_dma_unmap. With
 * VFIO_DMA_UNMAP_FLAG_ALL both iova and size must be zero and everything is
 * unmapped; otherwise the given range is unmapped. Once an unmap has been
 * attempted, the number of bytes unmapped is written back in unmap.size, even
 * if the unmap itself returned an error. @cmd is unused.
 *
 * Return: the result of the unmap operation, -EFAULT if the argument cannot be
 * copied in or out, -EINVAL for a short argsz, unsupported flags or a
 * non-zero range with FLAG_ALL, or the errno from get_compat_ioas() or
 * iopt_cut_iova().
 */
static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				  void __user *arg)
{
	const size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
	/*
	 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
	 * dirty tracking direction:
	 *  https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
	 *  https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
	 */
	const u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
	struct vfio_iommu_type1_dma_unmap unmap;
	unsigned long unmapped = 0;
	struct iommufd_ioas *ioas;
	int rc;

	if (copy_from_user(&unmap, arg, minsz))
		return -EFAULT;

	if (unmap.argsz < minsz)
		return -EINVAL;
	if (unmap.flags & ~supported_flags)
		return -EINVAL;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
		/* "Unmap all" takes no range */
		if (unmap.iova || unmap.size) {
			rc = -EINVAL;
			goto out_put;
		}
		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
	} else {
		if (READ_ONCE(ioas->iopt.disable_large_pages)) {
			/*
			 * Create cuts at the start and last of the requested
			 * range. If the start IOVA is 0 then it doesn't need to
			 * be cut.
			 */
			unsigned long cuts[] = { unmap.iova + unmap.size - 1,
						 unmap.iova - 1 };

			rc = iopt_cut_iova(&ioas->iopt, cuts,
					   unmap.iova ? 2 : 1);
			if (rc)
				goto out_put;
		}
		rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
				     &unmapped);
	}

	/* Report how much was unmapped, whatever the unmap result was */
	unmap.size = unmapped;
	if (copy_to_user(arg, &unmap, minsz))
		rc = -EFAULT;

out_put:
	iommufd_put_object(&ioas->obj);
	return rc;
}
255
iommufd_vfio_cc_iommu(struct iommufd_ctx * ictx)256 static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
257 {
258 struct iommufd_hw_pagetable *hwpt;
259 struct iommufd_ioas *ioas;
260 int rc = 1;
261
262 ioas = get_compat_ioas(ictx);
263 if (IS_ERR(ioas))
264 return PTR_ERR(ioas);
265
266 mutex_lock(&ioas->mutex);
267 list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
268 if (!hwpt->enforce_cache_coherency) {
269 rc = 0;
270 break;
271 }
272 }
273 mutex_unlock(&ioas->mutex);
274
275 iommufd_put_object(&ioas->obj);
276 return rc;
277 }
278
iommufd_vfio_check_extension(struct iommufd_ctx * ictx,unsigned long type)279 static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
280 unsigned long type)
281 {
282 switch (type) {
283 case VFIO_TYPE1_IOMMU:
284 case VFIO_TYPE1v2_IOMMU:
285 case VFIO_UNMAP_ALL:
286 return 1;
287
288 case VFIO_NOIOMMU_IOMMU:
289 return IS_ENABLED(CONFIG_VFIO_NOIOMMU);
290
291 case VFIO_DMA_CC_IOMMU:
292 return iommufd_vfio_cc_iommu(ictx);
293
294 /*
295 * This is obsolete, and to be removed from VFIO. It was an incomplete
296 * idea that got merged.
297 * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
298 */
299 case VFIO_TYPE1_NESTING_IOMMU:
300 return 0;
301
302 /*
303 * VFIO_DMA_MAP_FLAG_VADDR
304 * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
305 * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
306 *
307 * It is hard to see how this could be implemented safely.
308 */
309 case VFIO_UPDATE_VADDR:
310 default:
311 return 0;
312 }
313 }
314
iommufd_vfio_set_iommu(struct iommufd_ctx * ictx,unsigned long type)315 static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
316 {
317 bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode);
318 struct iommufd_ioas *ioas = NULL;
319 int rc = 0;
320
321 /*
322 * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all
323 * other ioctls. We let them keep working but they mostly fail since no
324 * IOAS should exist.
325 */
326 if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU &&
327 no_iommu_mode) {
328 if (!capable(CAP_SYS_RAWIO))
329 return -EPERM;
330 return 0;
331 }
332
333 if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) ||
334 no_iommu_mode)
335 return -EINVAL;
336
337 /* VFIO fails the set_iommu if there is no group */
338 ioas = get_compat_ioas(ictx);
339 if (IS_ERR(ioas))
340 return PTR_ERR(ioas);
341
342 /*
343 * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
344 * the middle of mapped ranges. This is complicated by huge page support
345 * which creates single large IOPTEs that cannot be split by the iommu
346 * driver. TYPE1 is very old at this point and likely nothing uses it,
347 * however it is simple enough to emulate by simply disabling the
348 * problematic large IOPTEs. Then we can safely unmap within any range.
349 */
350 if (type == VFIO_TYPE1_IOMMU)
351 rc = iopt_disable_large_pages(&ioas->iopt);
352 iommufd_put_object(&ioas->obj);
353 return rc;
354 }
355
iommufd_get_pagesizes(struct iommufd_ioas * ioas)356 static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
357 {
358 struct io_pagetable *iopt = &ioas->iopt;
359 unsigned long pgsize_bitmap = ULONG_MAX;
360 struct iommu_domain *domain;
361 unsigned long index;
362
363 down_read(&iopt->domains_rwsem);
364 xa_for_each(&iopt->domains, index, domain)
365 pgsize_bitmap &= domain->pgsize_bitmap;
366
367 /* See vfio_update_pgsize_bitmap() */
368 if (pgsize_bitmap & ~PAGE_MASK) {
369 pgsize_bitmap &= PAGE_MASK;
370 pgsize_bitmap |= PAGE_SIZE;
371 }
372 pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
373 up_read(&iopt->domains_rwsem);
374 return pgsize_bitmap;
375 }
376
/*
 * Emit the VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability for @ioas.
 *
 * Every hole in the IOAS's reserved_itree over [0, ULONG_MAX] is reported as
 * one iova range. A range, and finally the capability header, is copied to
 * userspace at @cur only while @avail is large enough to hold it. The ranges
 * are always counted, so the return value is the full size required even when
 * nothing could be written.
 *
 * Return: size in bytes of the complete capability, or -EFAULT if a copy to
 * userspace fails.
 */
static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
				 struct vfio_info_cap_header __user *cur,
				 size_t avail)
{
	struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
		container_of(cur,
			     struct vfio_iommu_type1_info_cap_iova_range __user,
			     header);
	struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
			.version = 1,
		},
	};
	struct interval_tree_span_iter span;
	size_t cap_size;

	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
				    ULONG_MAX) {
		struct vfio_iova_range range;

		/* Only the gaps between reserved regions are usable IOVA */
		if (!span.is_hole)
			continue;

		range.start = span.start_hole;
		range.end = span.last_hole;

		/* Write this entry only if the buffer has room for it */
		if (avail >= struct_size(&cap_iovas, iova_ranges,
					 cap_iovas.nr_iovas + 1)) {
			if (copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
					 &range, sizeof(range)))
				return -EFAULT;
		}
		cap_iovas.nr_iovas++;
	}

	cap_size = struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);

	/* The fixed part goes out last, once nr_iovas is final */
	if (avail >= cap_size) {
		if (copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
			return -EFAULT;
	}

	return cap_size;
}
413
/*
 * Emit the VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL capability.
 *
 * The capability is copied to userspace at @cur only when @avail can hold it.
 * @ioas is not consulted.
 *
 * Return: size in bytes of the capability, or -EFAULT if the copy to
 * userspace fails.
 */
static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
				      struct vfio_info_cap_header __user *cur,
				      size_t avail)
{
	struct vfio_iommu_type1_info_dma_avail cap_dma = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
			.version = 1,
		},
		/*
		 * iommufd's limit is based on the cgroup's memory limit.
		 * Normally vfio would return U16_MAX here, and provide a module
		 * parameter to adjust it. Since S390 qemu userspace actually
		 * pays attention and needs a value bigger than U16_MAX return
		 * U32_MAX.
		 */
		.avail = U32_MAX,
	};

	/* Too little room: just report the size that would be needed */
	if (avail < sizeof(cap_dma))
		return sizeof(cap_dma);

	if (copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
		return -EFAULT;

	return sizeof(cap_dma);
}
438
/*
 * Emulate VFIO_IOMMU_GET_INFO on top of the compat IOAS.
 *
 * @arg points to a userspace struct vfio_iommu_type1_info, optionally followed
 * by room for a chain of capabilities. Each entry of fill_fns[] is called in
 * turn with the space remaining after the caps emitted so far, or with
 * (NULL, 0) once the user buffer is exhausted, and returns the size it needs.
 * Each cap starts at a u64-aligned offset, and the previous cap's ->next is
 * set to the offset of the current one when argsz reaches that offset.
 *
 * Return: 0 on success, -EFAULT on a failed user copy, -EINVAL for a short
 * argsz, the errno from get_compat_ioas(), or a negative value from a fill
 * function.
 */
static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
				       void __user *arg)
{
	typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
				   struct vfio_info_cap_header __user *cur,
				   size_t avail);
	static const fill_cap_fn fill_fns[] = {
		iommufd_fill_cap_dma_avail,
		iommufd_fill_cap_iova,
	};
	const size_t minsz =
		offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
	struct vfio_info_cap_header __user *last_cap = NULL;
	struct vfio_iommu_type1_info info = {};
	struct iommufd_ioas *ioas;
	size_t total_cap_size;
	size_t reply_len;
	size_t i;
	int rc;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	/* Never write back more than the user supplied or than we have */
	reply_len = min_t(size_t, info.argsz, sizeof(info));

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	info.flags = VFIO_IOMMU_INFO_PGSIZES;
	info.iova_pgsizes = iommufd_get_pagesizes(ioas);
	info.cap_offset = 0;

	down_read(&ioas->iopt.iova_rwsem);

	/* Capabilities are laid out directly behind the fixed info struct */
	total_cap_size = sizeof(info);
	for (i = 0; i < ARRAY_SIZE(fill_fns); i++) {
		struct vfio_info_cap_header __user *cur = NULL;
		size_t avail = 0;
		int cap_size;

		if (info.argsz > total_cap_size) {
			cur = arg + total_cap_size;
			avail = info.argsz - total_cap_size;
		}

		cap_size = fill_fns[i](ioas, cur, avail);
		if (cap_size < 0) {
			rc = cap_size;
			goto out_unlock;
		}

		/* Chain the previous capability to this one */
		if (last_cap && info.argsz >= total_cap_size &&
		    put_user(total_cap_size, &last_cap->next)) {
			rc = -EFAULT;
			goto out_unlock;
		}
		last_cap = arg + total_cap_size;
		total_cap_size += ALIGN(cap_size, sizeof(u64));
	}

	/*
	 * If the user did not provide enough space then only some caps are
	 * returned and the argsz will be updated to the correct amount to get
	 * all caps.
	 */
	if (info.argsz >= total_cap_size)
		info.cap_offset = sizeof(info);
	info.argsz = total_cap_size;
	info.flags |= VFIO_IOMMU_INFO_CAPS;

	rc = copy_to_user(arg, &info, reply_len) ? -EFAULT : 0;

out_unlock:
	up_read(&ioas->iopt.iova_rwsem);
	iommufd_put_object(&ioas->obj);
	return rc;
}
517
iommufd_vfio_ioctl(struct iommufd_ctx * ictx,unsigned int cmd,unsigned long arg)518 int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
519 unsigned long arg)
520 {
521 void __user *uarg = (void __user *)arg;
522
523 switch (cmd) {
524 case VFIO_GET_API_VERSION:
525 return VFIO_API_VERSION;
526 case VFIO_SET_IOMMU:
527 return iommufd_vfio_set_iommu(ictx, arg);
528 case VFIO_CHECK_EXTENSION:
529 return iommufd_vfio_check_extension(ictx, arg);
530 case VFIO_IOMMU_GET_INFO:
531 return iommufd_vfio_iommu_get_info(ictx, uarg);
532 case VFIO_IOMMU_MAP_DMA:
533 return iommufd_vfio_map_dma(ictx, cmd, uarg);
534 case VFIO_IOMMU_UNMAP_DMA:
535 return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
536 case VFIO_IOMMU_DIRTY_PAGES:
537 default:
538 return -ENOIOCTLCMD;
539 }
540 return -ENOIOCTLCMD;
541 }
542