// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

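/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. Each
 * probed op gets IO_URING_OP_SUPPORTED set in its flags if it is available,
 * and the result is copied back into the user supplied io_uring_probe.
 */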
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

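/*
 * Drop a personality previously registered with IORING_REGISTER_PERSONALITY,
 * releasing the credentials stored for the given id.
 */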
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


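/*
 * Stash the current task's credentials in the ctx and return an id that
 * SQEs can later reference through sqe->personality.
 */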
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

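/*
 * Restrict which register opcodes, SQE opcodes, and SQE flags the ring will
 * accept. Only allowed while the ring is still disabled
 * (IORING_SETUP_R_DISABLED), and only a single registration is permitted.
 */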
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

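/*
 * Enable a ring that was created with IORING_SETUP_R_DISABLED. Any
 * registered restrictions take effect from this point on, and a waiting
 * SQPOLL thread is woken up.
 */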
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

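/*
 * Apply the given CPU affinity mask to the io-wq of the calling task, or to
 * the SQPOLL thread's io-wq when the ring uses IORING_SETUP_SQPOLL. A NULL
 * mask clears any previously set affinity.
 */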
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

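/*
 * IORING_REGISTER_IOWQ_AFF: copy the CPU mask from userspace (with compat
 * handling) and apply it via __io_register_iowq_aff().
 */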
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

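/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: update the [bounded, unbounded] io-wq
 * worker limits. A count of 0 leaves that limit unchanged. The previous
 * values are copied back to userspace, and the new limits are propagated to
 * every task registered with the ring (or only to the SQPOLL task when
 * SQPOLL is in use).
 */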
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

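/*
 * IORING_REGISTER_CLOCK: pick the clock source (CLOCK_MONOTONIC or
 * CLOCK_BOOTTIME) that the ring uses when timing completion waits.
 */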
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

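/*
 * Dispatch a single io_uring_register() opcode with ctx->uring_lock held.
 * If the ring is restricted, opcodes that were not explicitly allowed via
 * IORING_REGISTER_RESTRICTIONS are rejected with -EACCES.
 */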
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the io_uring file associated with it. If
 * 'registered' is true, then the registered index is used. Otherwise, the
 * normal fd table. Caller must call fput() on the returned file, unless it's
 * an ERR_PTR or was looked up via the registered index (no extra reference
 * is taken for those).
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

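/*
 * io_uring_register() syscall entry point. 'fd' is a normal ring fd, or an
 * index into the task's registered ring array when
 * IORING_REGISTER_USE_REGISTERED_RING is set in 'opcode'.
 *
 * Illustrative userspace sketch (a raw syscall is assumed; liburing wraps
 * this as io_uring_register_probe() and friends):
 *
 *	struct io_uring_probe *p;
 *
 *	p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE, p, 256);
 */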
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
	if (!use_registered_ring)
		fput(file);
	return ret;
}