1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * fs/userfaultfd.c
4 *
5 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6 * Copyright (C) 2008-2009 Red Hat, Inc.
7 * Copyright (C) 2015 Red Hat, Inc.
8 *
9 * Some part derived from fs/eventfd.c (anon inode setup) and
10 * mm/ksm.c (mm hashing).
11 */
12
13 #include <linux/list.h>
14 #include <linux/hashtable.h>
15 #include <linux/sched/signal.h>
16 #include <linux/sched/mm.h>
17 #include <linux/mm.h>
18 #include <linux/mm_inline.h>
19 #include <linux/mmu_notifier.h>
20 #include <linux/poll.h>
21 #include <linux/slab.h>
22 #include <linux/seq_file.h>
23 #include <linux/file.h>
24 #include <linux/bug.h>
25 #include <linux/anon_inodes.h>
26 #include <linux/syscalls.h>
27 #include <linux/userfaultfd_k.h>
28 #include <linux/mempolicy.h>
29 #include <linux/ioctl.h>
30 #include <linux/security.h>
31 #include <linux/hugetlb.h>
32 #include <linux/swapops.h>
33 #include <linux/miscdevice.h>
34
35 int sysctl_unprivileged_userfaultfd __read_mostly;
36
37 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
38
39 /*
40 * Start with fault_pending_wqh and fault_wqh so they're more likely
41 * to be in the same cacheline.
42 *
43 * Locking order:
44 * fd_wqh.lock
45 * fault_pending_wqh.lock
46 * fault_wqh.lock
47 * event_wqh.lock
48 *
49 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
50 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
51 * also taken in IRQ context.
52 */
53 struct userfaultfd_ctx {
54 /* waitqueue head for the pending (i.e. not read) userfaults */
55 wait_queue_head_t fault_pending_wqh;
56 /* waitqueue head for the userfaults */
57 wait_queue_head_t fault_wqh;
58 /* waitqueue head for the pseudo fd to wakeup poll/read */
59 wait_queue_head_t fd_wqh;
60 /* waitqueue head for events */
61 wait_queue_head_t event_wqh;
62 /* a refile sequence protected by fault_pending_wqh lock */
63 seqcount_spinlock_t refile_seq;
64 /* pseudo fd refcounting */
65 refcount_t refcount;
66 /* userfaultfd syscall flags */
67 unsigned int flags;
68 /* features requested from the userspace */
69 unsigned int features;
70 /* released */
71 bool released;
72 /* memory mappings are changing because of non-cooperative event */
73 atomic_t mmap_changing;
74 /* mm with one or more vmas attached to this userfaultfd_ctx */
75 struct mm_struct *mm;
76 };
77
78 struct userfaultfd_fork_ctx {
79 struct userfaultfd_ctx *orig;
80 struct userfaultfd_ctx *new;
81 struct list_head list;
82 };
83
84 struct userfaultfd_unmap_ctx {
85 struct userfaultfd_ctx *ctx;
86 unsigned long start;
87 unsigned long end;
88 struct list_head list;
89 };
90
91 struct userfaultfd_wait_queue {
92 struct uffd_msg msg;
93 wait_queue_entry_t wq;
94 struct userfaultfd_ctx *ctx;
95 bool waken;
96 };
97
98 struct userfaultfd_wake_range {
99 unsigned long start;
100 unsigned long len;
101 };
102
103 /* internal indication that UFFD_API ioctl was successfully executed */
104 #define UFFD_FEATURE_INITIALIZED (1u << 31)
105
106 static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
107 {
108 return ctx->features & UFFD_FEATURE_INITIALIZED;
109 }
110
111 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
112 vm_flags_t flags)
113 {
114 const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
115
116 vm_flags_reset(vma, flags);
117 /*
118 * For shared mappings, we want to enable writenotify while
119 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
120 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
121 */
122 if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
123 vma_set_page_prot(vma);
124 }
125
126 static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
127 int wake_flags, void *key)
128 {
129 struct userfaultfd_wake_range *range = key;
130 int ret;
131 struct userfaultfd_wait_queue *uwq;
132 unsigned long start, len;
133
134 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
135 ret = 0;
136 /* len == 0 means wake all */
137 start = range->start;
138 len = range->len;
139 if (len && (start > uwq->msg.arg.pagefault.address ||
140 start + len <= uwq->msg.arg.pagefault.address))
141 goto out;
142 WRITE_ONCE(uwq->waken, true);
143 /*
144 * The Program-Order guarantees provided by the scheduler
145 * ensure uwq->waken is visible before the task is woken.
146 */
147 ret = wake_up_state(wq->private, mode);
148 if (ret) {
149 /*
150 * Wake only once, autoremove behavior.
151 *
152 * After the effect of list_del_init is visible to the other
153 * CPUs, the waitqueue may disappear from under us, see the
154 * !list_empty_careful() in handle_userfault().
155 *
156 * try_to_wake_up() has an implicit smp_mb(), and the
157 * wq->private is read before calling the extern function
158 * "wake_up_state" (which in turns calls try_to_wake_up).
159 */
160 list_del_init(&wq->entry);
161 }
162 out:
163 return ret;
164 }
165
166 /**
167 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
168 * context.
169 * @ctx: [in] Pointer to the userfaultfd context.
170 */
171 static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
172 {
173 refcount_inc(&ctx->refcount);
174 }
175
176 /**
177 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
178 * context.
179 * @ctx: [in] Pointer to userfaultfd context.
180 *
181 * The userfaultfd context reference must have been previously acquired either
182 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
183 */
184 static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
185 {
186 if (refcount_dec_and_test(&ctx->refcount)) {
187 VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
188 VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
189 VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
190 VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
191 VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
192 VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
193 VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
194 VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
195 mmdrop(ctx->mm);
196 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
197 }
198 }
199
200 static inline void msg_init(struct uffd_msg *msg)
201 {
202 BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
203 /*
204 * Must use memset to zero out the padding, or kernel data is
205 * leaked to userland.
206 */
207 memset(msg, 0, sizeof(struct uffd_msg));
208 }
209
210 static inline struct uffd_msg userfault_msg(unsigned long address,
211 unsigned long real_address,
212 unsigned int flags,
213 unsigned long reason,
214 unsigned int features)
215 {
216 struct uffd_msg msg;
217
218 msg_init(&msg);
219 msg.event = UFFD_EVENT_PAGEFAULT;
220
221 msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
222 real_address : address;
223
224 /*
225 * These flags indicate why the userfault occurred:
226 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
227 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
228 * - Neither of these flags being set indicates a MISSING fault.
229 *
230 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
231 * fault. Otherwise, it was a read fault.
232 */
233 if (flags & FAULT_FLAG_WRITE)
234 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
235 if (reason & VM_UFFD_WP)
236 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
237 if (reason & VM_UFFD_MINOR)
238 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
239 if (features & UFFD_FEATURE_THREAD_ID)
240 msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
241 return msg;
242 }
243
244 #ifdef CONFIG_HUGETLB_PAGE
245 /*
246 * Same functionality as userfaultfd_must_wait below with modifications for
247 * hugepmd ranges.
248 */
249 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
250 struct vm_fault *vmf,
251 unsigned long reason)
252 {
253 struct vm_area_struct *vma = vmf->vma;
254 struct mm_struct *mm = ctx->mm;
255 pte_t *ptep, pte;
256 bool ret = true;
257
258 assert_fault_locked(vmf);
259
260 ptep = huge_pte_offset(mm, vmf->address, vma_mmu_pagesize(vma));
261
262 if (!ptep)
263 goto out;
264
265 ret = false;
266 pte = huge_ptep_get(ptep);
267
268 /*
269 * Lockless access: we're in a wait_event so it's ok if it
270 * changes under us. PTE markers should be handled the same as none
271 * ptes here.
272 */
273 if (huge_pte_none_mostly(pte))
274 ret = true;
275 if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
276 ret = true;
277 out:
278 return ret;
279 }
280 #else
281 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
282 struct vm_fault *vmf,
283 unsigned long reason)
284 {
285 return false; /* should never get here */
286 }
287 #endif /* CONFIG_HUGETLB_PAGE */
288
289 /*
290 * Verify the pagetables are still not ok after having registered into
291 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
292 * userfault that has already been resolved, if userfaultfd_read and
293 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
294 * threads.
295 */
296 static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
297 struct vm_fault *vmf,
298 unsigned long reason)
299 {
300 struct mm_struct *mm = ctx->mm;
301 unsigned long address = vmf->address;
302 pgd_t *pgd;
303 p4d_t *p4d;
304 pud_t *pud;
305 pmd_t *pmd, _pmd;
306 pte_t *pte;
307 bool ret = true;
308
309 assert_fault_locked(vmf);
310
311 pgd = pgd_offset(mm, address);
312 if (!pgd_present(*pgd))
313 goto out;
314 p4d = p4d_offset(pgd, address);
315 if (!p4d_present(*p4d))
316 goto out;
317 pud = pud_offset(p4d, address);
318 if (!pud_present(*pud))
319 goto out;
320 pmd = pmd_offset(pud, address);
321 /*
322 * READ_ONCE must function as a barrier with narrower scope
323 * and it must be equivalent to:
324 * _pmd = *pmd; barrier();
325 *
326 * This is to deal with the instability (as in
327 * pmd_trans_unstable) of the pmd.
328 */
329 _pmd = READ_ONCE(*pmd);
330 if (pmd_none(_pmd))
331 goto out;
332
333 ret = false;
334 if (!pmd_present(_pmd))
335 goto out;
336
337 if (pmd_trans_huge(_pmd)) {
338 if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
339 ret = true;
340 goto out;
341 }
342
343 /*
344 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
345 * and use the standard pte_offset_map() instead of parsing _pmd.
346 */
347 pte = pte_offset_map(pmd, address);
348 /*
349 * Lockless access: we're in a wait_event so it's ok if it
350 * changes under us. PTE markers should be handled the same as none
351 * ptes here.
352 */
353 if (pte_none_mostly(*pte))
354 ret = true;
355 if (!pte_write(*pte) && (reason & VM_UFFD_WP))
356 ret = true;
357 pte_unmap(pte);
358
359 out:
360 return ret;
361 }
362
363 static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
364 {
365 if (flags & FAULT_FLAG_INTERRUPTIBLE)
366 return TASK_INTERRUPTIBLE;
367
368 if (flags & FAULT_FLAG_KILLABLE)
369 return TASK_KILLABLE;
370
371 return TASK_UNINTERRUPTIBLE;
372 }
373
374 /*
375 * The locking rules involved in returning VM_FAULT_RETRY depending on
376 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
377 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
378 * recommendation in __lock_page_or_retry is not an understatement.
379 *
380 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
381 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
382 * not set.
383 *
384 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
385 * set, VM_FAULT_RETRY can still be returned if and only if there are
386 * fatal_signal_pending()s, and the mmap_lock must be released before
387 * returning it.
388 */
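/*
 * Descriptive note: this is the fault-path entry point, reached when a
 * vma registered with userfaultfd takes a missing, write-protect or
 * minor fault. It queues a message for the monitor and puts the
 * faulting task to sleep until the fault is resolved (or the fd is
 * released), then returns VM_FAULT_RETRY so the fault is retried.
 */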
389 vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
390 {
391 struct mm_struct *mm = vmf->vma->vm_mm;
392 struct userfaultfd_ctx *ctx;
393 struct userfaultfd_wait_queue uwq;
394 vm_fault_t ret = VM_FAULT_SIGBUS;
395 bool must_wait;
396 unsigned int blocking_state;
397
398 /*
399 * We don't do userfault handling for the final child pid update.
400 *
401 * We also don't do userfault handling during
402 * coredumping. hugetlbfs has the special
403 * follow_hugetlb_page() to skip missing pages in the
404 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
405 * the no_page_table() helper in follow_page_mask(), but the
406 * shmem_vm_ops->fault method is invoked even during
407 * coredumping without mmap_lock and it ends up here.
408 */
409 if (current->flags & (PF_EXITING|PF_DUMPCORE))
410 goto out;
411
412 /*
413 * Coredumping runs without mmap_lock so we can only check that
414 * the mmap_lock is held, if PF_DUMPCORE was not set.
415 */
416 assert_fault_locked(vmf);
417
418 ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
419 if (!ctx)
420 goto out;
421
422 BUG_ON(ctx->mm != mm);
423
424 /* Any unrecognized flag is a bug. */
425 VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
426 /* 0 or > 1 flags set is a bug; we expect exactly 1. */
427 VM_BUG_ON(!reason || (reason & (reason - 1)));
428
429 if (ctx->features & UFFD_FEATURE_SIGBUS)
430 goto out;
431 if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
432 goto out;
433
434 /*
435 * If it's already released don't get it. This avoids looping
436 * in __get_user_pages if userfaultfd_release waits on the
437 * caller of handle_userfault to release the mmap_lock.
438 */
439 if (unlikely(READ_ONCE(ctx->released))) {
440 /*
441 * Don't return VM_FAULT_SIGBUS in this case, so a non
442 * cooperative manager can close the uffd after the
443 * last UFFDIO_COPY, without risking to trigger an
444 * involuntary SIGBUS if the process was starting the
445 * userfaultfd while the userfaultfd was still armed
446 * (but after the last UFFDIO_COPY). If the uffd
447 * wasn't already closed when the userfault reached
448 * this point, that would normally be solved by
449 * userfaultfd_must_wait returning 'false'.
450 *
451 * If we were to return VM_FAULT_SIGBUS here, the non
452 * cooperative manager would be instead forced to
453 * always call UFFDIO_UNREGISTER before it can safely
454 * close the uffd.
455 */
456 ret = VM_FAULT_NOPAGE;
457 goto out;
458 }
459
460 /*
461 * Check that we can return VM_FAULT_RETRY.
462 *
463 * NOTE: it should become possible to return VM_FAULT_RETRY
464 * even if FAULT_FLAG_TRIED is set without leading to gup()
465 * -EBUSY failures, if the userfaultfd is to be extended for
466 * VM_UFFD_WP tracking and we intend to arm the userfault
467 * without first stopping userland access to the memory. For
468 * VM_UFFD_MISSING userfaults this is enough for now.
469 */
470 if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
471 /*
472 * Validate the invariant that nowait must allow retry
473 * to be sure not to return SIGBUS erroneously on
474 * nowait invocations.
475 */
476 BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
477 #ifdef CONFIG_DEBUG_VM
478 if (printk_ratelimit()) {
479 printk(KERN_WARNING
480 "FAULT_FLAG_ALLOW_RETRY missing %x\n",
481 vmf->flags);
482 dump_stack();
483 }
484 #endif
485 goto out;
486 }
487
488 /*
489 * Handle nowait, not much to do other than tell it to retry
490 * and wait.
491 */
492 ret = VM_FAULT_RETRY;
493 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
494 goto out;
495
496 /* take the reference before dropping the mmap_lock */
497 userfaultfd_ctx_get(ctx);
498
499 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
500 uwq.wq.private = current;
501 uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
502 reason, ctx->features);
503 uwq.ctx = ctx;
504 uwq.waken = false;
505
506 blocking_state = userfaultfd_get_blocking_state(vmf->flags);
507
508 spin_lock_irq(&ctx->fault_pending_wqh.lock);
509 /*
510 * After the __add_wait_queue the uwq is visible to userland
511 * through poll/read().
512 */
513 __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
514 /*
515 * The smp_mb() after __set_current_state prevents the reads
516 * following the spin_unlock to happen before the list_add in
517 * __add_wait_queue.
518 */
519 set_current_state(blocking_state);
520 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
521
522 if (!is_vm_hugetlb_page(vmf->vma))
523 must_wait = userfaultfd_must_wait(ctx, vmf, reason);
524 else
525 must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
526 release_fault_lock(vmf);
527
528 if (likely(must_wait && !READ_ONCE(ctx->released))) {
529 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
530 schedule();
531 }
532
533 __set_current_state(TASK_RUNNING);
534
535 /*
536 * Here we race with the list_del; list_add in
537 * userfaultfd_ctx_read(), however because we don't ever run
538 * list_del_init() to refile across the two lists, the prev
539 * and next pointers will never point to self. list_add also
540 * would never let either of the two pointers point to
541 * self. So list_empty_careful won't risk seeing both pointers
542 * pointing to self at any time during the list refile. The
543 * only case where list_del_init() is called is the full
544 * removal in the wake function and there we don't re-list_add
545 * and it's fine not to block on the spinlock. The uwq on this
546 * kernel stack can be released after the list_del_init.
547 */
548 if (!list_empty_careful(&uwq.wq.entry)) {
549 spin_lock_irq(&ctx->fault_pending_wqh.lock);
550 /*
551 * No need of list_del_init(), the uwq on the stack
552 * will be freed shortly anyway.
553 */
554 list_del(&uwq.wq.entry);
555 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
556 }
557
558 /*
559 * ctx may go away after this if the userfault pseudo fd is
560 * already released.
561 */
562 userfaultfd_ctx_put(ctx);
563
564 out:
565 return ret;
566 }
567
568 static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
569 struct userfaultfd_wait_queue *ewq)
570 {
571 struct userfaultfd_ctx *release_new_ctx;
572
573 if (WARN_ON_ONCE(current->flags & PF_EXITING))
574 goto out;
575
576 ewq->ctx = ctx;
577 init_waitqueue_entry(&ewq->wq, current);
578 release_new_ctx = NULL;
579
580 spin_lock_irq(&ctx->event_wqh.lock);
581 /*
582 * After the __add_wait_queue the uwq is visible to userland
583 * through poll/read().
584 */
585 __add_wait_queue(&ctx->event_wqh, &ewq->wq);
586 for (;;) {
587 set_current_state(TASK_KILLABLE);
588 if (ewq->msg.event == 0)
589 break;
590 if (READ_ONCE(ctx->released) ||
591 fatal_signal_pending(current)) {
592 /*
593 * &ewq->wq may be queued in fork_event, but
594 * __remove_wait_queue ignores the head
595 * parameter. It would be a problem if it
596 * didn't.
597 */
598 __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
599 if (ewq->msg.event == UFFD_EVENT_FORK) {
600 struct userfaultfd_ctx *new;
601
602 new = (struct userfaultfd_ctx *)
603 (unsigned long)
604 ewq->msg.arg.reserved.reserved1;
605 release_new_ctx = new;
606 }
607 break;
608 }
609
610 spin_unlock_irq(&ctx->event_wqh.lock);
611
612 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
613 schedule();
614
615 spin_lock_irq(&ctx->event_wqh.lock);
616 }
617 __set_current_state(TASK_RUNNING);
618 spin_unlock_irq(&ctx->event_wqh.lock);
619
620 if (release_new_ctx) {
621 struct vm_area_struct *vma;
622 struct mm_struct *mm = release_new_ctx->mm;
623 VMA_ITERATOR(vmi, mm, 0);
624
625 /* the various vma->vm_userfaultfd_ctx still points to it */
626 mmap_write_lock(mm);
627 for_each_vma(vmi, vma) {
628 if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
629 vma_start_write(vma);
630 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
631 userfaultfd_set_vm_flags(vma,
632 vma->vm_flags & ~__VM_UFFD_FLAGS);
633 }
634 }
635 mmap_write_unlock(mm);
636
637 userfaultfd_ctx_put(release_new_ctx);
638 }
639
640 /*
641 * ctx may go away after this if the userfault pseudo fd is
642 * already released.
643 */
644 out:
645 atomic_dec(&ctx->mmap_changing);
646 VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
647 userfaultfd_ctx_put(ctx);
648 }
649
650 static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
651 struct userfaultfd_wait_queue *ewq)
652 {
653 ewq->msg.event = 0;
654 wake_up_locked(&ctx->event_wqh);
655 __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
656 }
657
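/*
 * Fork-time hook (descriptive note): called for each vma copied into
 * the child's mm. If the parent context requested
 * UFFD_FEATURE_EVENT_FORK, a new context is allocated (or reused from
 * @fcs) for the child and a fork event is queued later by
 * dup_userfaultfd_complete(); otherwise the child vma simply drops the
 * userfaultfd association.
 */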
658 int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
659 {
660 struct userfaultfd_ctx *ctx = NULL, *octx;
661 struct userfaultfd_fork_ctx *fctx;
662
663 octx = vma->vm_userfaultfd_ctx.ctx;
664 if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
665 vma_start_write(vma);
666 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
667 userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
668 return 0;
669 }
670
671 list_for_each_entry(fctx, fcs, list)
672 if (fctx->orig == octx) {
673 ctx = fctx->new;
674 break;
675 }
676
677 if (!ctx) {
678 fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
679 if (!fctx)
680 return -ENOMEM;
681
682 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
683 if (!ctx) {
684 kfree(fctx);
685 return -ENOMEM;
686 }
687
688 refcount_set(&ctx->refcount, 1);
689 ctx->flags = octx->flags;
690 ctx->features = octx->features;
691 ctx->released = false;
692 atomic_set(&ctx->mmap_changing, 0);
693 ctx->mm = vma->vm_mm;
694 mmgrab(ctx->mm);
695
696 userfaultfd_ctx_get(octx);
697 atomic_inc(&octx->mmap_changing);
698 fctx->orig = octx;
699 fctx->new = ctx;
700 list_add_tail(&fctx->list, fcs);
701 }
702
703 vma->vm_userfaultfd_ctx.ctx = ctx;
704 return 0;
705 }
706
707 static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
708 {
709 struct userfaultfd_ctx *ctx = fctx->orig;
710 struct userfaultfd_wait_queue ewq;
711
712 msg_init(&ewq.msg);
713
714 ewq.msg.event = UFFD_EVENT_FORK;
715 ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
716
717 userfaultfd_event_wait_completion(ctx, &ewq);
718 }
719
720 void dup_userfaultfd_complete(struct list_head *fcs)
721 {
722 struct userfaultfd_fork_ctx *fctx, *n;
723
724 list_for_each_entry_safe(fctx, n, fcs, list) {
725 dup_fctx(fctx);
726 list_del(&fctx->list);
727 kfree(fctx);
728 }
729 }
730
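/*
 * mremap() hook (descriptive note): when UFFD_FEATURE_EVENT_REMAP is
 * enabled, take a reference on the context and bump mmap_changing so
 * mremap_userfaultfd_complete() can report the move to the monitor;
 * otherwise the moved vma drops its userfaultfd association.
 */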
731 void mremap_userfaultfd_prep(struct vm_area_struct *vma,
732 struct vm_userfaultfd_ctx *vm_ctx)
733 {
734 struct userfaultfd_ctx *ctx;
735
736 ctx = vma->vm_userfaultfd_ctx.ctx;
737
738 if (!ctx)
739 return;
740
741 if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
742 vm_ctx->ctx = ctx;
743 userfaultfd_ctx_get(ctx);
744 atomic_inc(&ctx->mmap_changing);
745 } else {
746 /* Drop uffd context if remap feature not enabled */
747 vma_start_write(vma);
748 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
749 userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
750 }
751 }
752
753 void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
754 unsigned long from, unsigned long to,
755 unsigned long len)
756 {
757 struct userfaultfd_ctx *ctx = vm_ctx->ctx;
758 struct userfaultfd_wait_queue ewq;
759
760 if (!ctx)
761 return;
762
763 if (to & ~PAGE_MASK) {
764 userfaultfd_ctx_put(ctx);
765 return;
766 }
767
768 msg_init(&ewq.msg);
769
770 ewq.msg.event = UFFD_EVENT_REMAP;
771 ewq.msg.arg.remap.from = from;
772 ewq.msg.arg.remap.to = to;
773 ewq.msg.arg.remap.len = len;
774
775 userfaultfd_event_wait_completion(ctx, &ewq);
776 }
777
778 bool userfaultfd_remove(struct vm_area_struct *vma,
779 unsigned long start, unsigned long end)
780 {
781 struct mm_struct *mm = vma->vm_mm;
782 struct userfaultfd_ctx *ctx;
783 struct userfaultfd_wait_queue ewq;
784
785 ctx = vma->vm_userfaultfd_ctx.ctx;
786 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
787 return true;
788
789 userfaultfd_ctx_get(ctx);
790 atomic_inc(&ctx->mmap_changing);
791 mmap_read_unlock(mm);
792
793 msg_init(&ewq.msg);
794
795 ewq.msg.event = UFFD_EVENT_REMOVE;
796 ewq.msg.arg.remove.start = start;
797 ewq.msg.arg.remove.end = end;
798
799 userfaultfd_event_wait_completion(ctx, &ewq);
800
801 return false;
802 }
803
804 static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
805 unsigned long start, unsigned long end)
806 {
807 struct userfaultfd_unmap_ctx *unmap_ctx;
808
809 list_for_each_entry(unmap_ctx, unmaps, list)
810 if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
811 unmap_ctx->end == end)
812 return true;
813
814 return false;
815 }
816
817 int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
818 unsigned long end, struct list_head *unmaps)
819 {
820 struct userfaultfd_unmap_ctx *unmap_ctx;
821 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
822
823 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
824 has_unmap_ctx(ctx, unmaps, start, end))
825 return 0;
826
827 unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
828 if (!unmap_ctx)
829 return -ENOMEM;
830
831 userfaultfd_ctx_get(ctx);
832 atomic_inc(&ctx->mmap_changing);
833 unmap_ctx->ctx = ctx;
834 unmap_ctx->start = start;
835 unmap_ctx->end = end;
836 list_add_tail(&unmap_ctx->list, unmaps);
837
838 return 0;
839 }
840
841 void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
842 {
843 struct userfaultfd_unmap_ctx *ctx, *n;
844 struct userfaultfd_wait_queue ewq;
845
846 list_for_each_entry_safe(ctx, n, uf, list) {
847 msg_init(&ewq.msg);
848
849 ewq.msg.event = UFFD_EVENT_UNMAP;
850 ewq.msg.arg.remove.start = ctx->start;
851 ewq.msg.arg.remove.end = ctx->end;
852
853 userfaultfd_event_wait_completion(ctx->ctx, &ewq);
854
855 list_del(&ctx->list);
856 kfree(ctx);
857 }
858 }
859
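/*
 * ->release() of the userfaultfd file (descriptive note): mark the
 * context released, strip it from every registered vma, and wake all
 * pending fault and event waiters so nothing can block on a closed fd.
 */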
860 static int userfaultfd_release(struct inode *inode, struct file *file)
861 {
862 struct userfaultfd_ctx *ctx = file->private_data;
863 struct mm_struct *mm = ctx->mm;
864 struct vm_area_struct *vma, *prev;
865 /* len == 0 means wake all */
866 struct userfaultfd_wake_range range = { .len = 0, };
867 unsigned long new_flags;
868 MA_STATE(mas, &mm->mm_mt, 0, 0);
869
870 WRITE_ONCE(ctx->released, true);
871
872 if (!mmget_not_zero(mm))
873 goto wakeup;
874
875 /*
876 * Flush page faults out of all CPUs. NOTE: all page faults
877 * must be retried without returning VM_FAULT_SIGBUS if
878 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
879 * changes while handle_userfault released the mmap_lock. So
880 * it's critical that released is set to true (above), before
881 * taking the mmap_lock for writing.
882 */
883 mmap_write_lock(mm);
884 prev = NULL;
885 mas_for_each(&mas, vma, ULONG_MAX) {
886 cond_resched();
887 BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
888 !!(vma->vm_flags & __VM_UFFD_FLAGS));
889 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
890 prev = vma;
891 continue;
892 }
893 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
894 prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
895 new_flags, vma->anon_vma,
896 vma->vm_file, vma->vm_pgoff,
897 vma_policy(vma),
898 NULL_VM_UFFD_CTX, anon_vma_name(vma));
899 if (prev) {
900 mas_pause(&mas);
901 vma = prev;
902 } else {
903 prev = vma;
904 }
905
906 vma_start_write(vma);
907 userfaultfd_set_vm_flags(vma, new_flags);
908 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
909 }
910 mmap_write_unlock(mm);
911 mmput(mm);
912 wakeup:
913 /*
914 * After no new page faults can wait on this fault_*wqh, flush
915 * the last page faults that may have been already waiting on
916 * the fault_*wqh.
917 */
918 spin_lock_irq(&ctx->fault_pending_wqh.lock);
919 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
920 __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
921 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
922
923 /* Flush pending events that may still wait on event_wqh */
924 wake_up_all(&ctx->event_wqh);
925
926 wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
927 userfaultfd_ctx_put(ctx);
928 return 0;
929 }
930
931 /* fault_pending_wqh.lock must be held by the caller */
932 static inline struct userfaultfd_wait_queue *find_userfault_in(
933 wait_queue_head_t *wqh)
934 {
935 wait_queue_entry_t *wq;
936 struct userfaultfd_wait_queue *uwq;
937
938 lockdep_assert_held(&wqh->lock);
939
940 uwq = NULL;
941 if (!waitqueue_active(wqh))
942 goto out;
943 /* walk in reverse to provide FIFO behavior to read userfaults */
944 wq = list_last_entry(&wqh->head, typeof(*wq), entry);
945 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
946 out:
947 return uwq;
948 }
949
950 static inline struct userfaultfd_wait_queue *find_userfault(
951 struct userfaultfd_ctx *ctx)
952 {
953 return find_userfault_in(&ctx->fault_pending_wqh);
954 }
955
956 static inline struct userfaultfd_wait_queue *find_userfault_evt(
957 struct userfaultfd_ctx *ctx)
958 {
959 return find_userfault_in(&ctx->event_wqh);
960 }
961
962 static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
963 {
964 struct userfaultfd_ctx *ctx = file->private_data;
965 __poll_t ret;
966
967 poll_wait(file, &ctx->fd_wqh, wait);
968
969 if (!userfaultfd_is_initialized(ctx))
970 return EPOLLERR;
971
972 /*
973 * poll() never guarantees that read won't block.
974 * userfaults can be woken before they're read().
975 */
976 if (unlikely(!(file->f_flags & O_NONBLOCK)))
977 return EPOLLERR;
978 /*
979 * Lockless access to see if there are pending faults.
980 * __pollwait's last action is the add_wait_queue, but
981 * the spin_unlock would allow the waitqueue_active to
982 * pass above the actual list_add inside the
983 * add_wait_queue critical section. So use a full
984 * memory barrier to serialize the list_add write of
985 * add_wait_queue() with the waitqueue_active read
986 * below.
987 */
988 ret = 0;
989 smp_mb();
990 if (waitqueue_active(&ctx->fault_pending_wqh))
991 ret = EPOLLIN;
992 else if (waitqueue_active(&ctx->event_wqh))
993 ret = EPOLLIN;
994
995 return ret;
996 }
997
998 static const struct file_operations userfaultfd_fops;
999
1000 static int resolve_userfault_fork(struct userfaultfd_ctx *new,
1001 struct inode *inode,
1002 struct uffd_msg *msg)
1003 {
1004 int fd;
1005
1006 fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
1007 O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
1008 if (fd < 0)
1009 return fd;
1010
1011 msg->arg.reserved.reserved1 = 0;
1012 msg->arg.fork.ufd = fd;
1013 return 0;
1014 }
1015
1016 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
1017 struct uffd_msg *msg, struct inode *inode)
1018 {
1019 ssize_t ret;
1020 DECLARE_WAITQUEUE(wait, current);
1021 struct userfaultfd_wait_queue *uwq;
1022 /*
1023 * Handling fork event requires sleeping operations, so
1024 * we drop the event_wqh lock, then do these ops, then
1025 * lock it back and wake up the waiter. While the lock is
1026 * dropped the ewq may go away so we keep track of it
1027 * carefully.
1028 */
1029 LIST_HEAD(fork_event);
1030 struct userfaultfd_ctx *fork_nctx = NULL;
1031
1032 /* always take the fd_wqh lock before the fault_pending_wqh lock */
1033 spin_lock_irq(&ctx->fd_wqh.lock);
1034 __add_wait_queue(&ctx->fd_wqh, &wait);
1035 for (;;) {
1036 set_current_state(TASK_INTERRUPTIBLE);
1037 spin_lock(&ctx->fault_pending_wqh.lock);
1038 uwq = find_userfault(ctx);
1039 if (uwq) {
1040 /*
1041 * Use a seqcount to repeat the lockless check
1042 * in wake_userfault() to avoid missing
1043 * wakeups because during the refile both
1044 * waitqueue could become empty if this is the
1045 * only userfault.
1046 */
1047 write_seqcount_begin(&ctx->refile_seq);
1048
1049 /*
1050 * The fault_pending_wqh.lock prevents the uwq
1051 * from disappearing from under us.
1052 *
1053 * Refile this userfault from
1054 * fault_pending_wqh to fault_wqh, it's not
1055 * pending anymore after we read it.
1056 *
1057 * Use list_del() by hand (as
1058 * userfaultfd_wake_function also uses
1059 * list_del_init() by hand) to be sure nobody
1060 * changes __remove_wait_queue() to use
1061 * list_del_init() in turn breaking the
1062 * !list_empty_careful() check in
1063 * handle_userfault(). The uwq->wq.head list
1064 * must never be empty at any time during the
1065 * refile, or the waitqueue could disappear
1066 * from under us. The "wait_queue_head_t"
1067 * parameter of __remove_wait_queue() is unused
1068 * anyway.
1069 */
1070 list_del(&uwq->wq.entry);
1071 add_wait_queue(&ctx->fault_wqh, &uwq->wq);
1072
1073 write_seqcount_end(&ctx->refile_seq);
1074
1075 /* careful to always initialize msg if ret == 0 */
1076 *msg = uwq->msg;
1077 spin_unlock(&ctx->fault_pending_wqh.lock);
1078 ret = 0;
1079 break;
1080 }
1081 spin_unlock(&ctx->fault_pending_wqh.lock);
1082
1083 spin_lock(&ctx->event_wqh.lock);
1084 uwq = find_userfault_evt(ctx);
1085 if (uwq) {
1086 *msg = uwq->msg;
1087
1088 if (uwq->msg.event == UFFD_EVENT_FORK) {
1089 fork_nctx = (struct userfaultfd_ctx *)
1090 (unsigned long)
1091 uwq->msg.arg.reserved.reserved1;
1092 list_move(&uwq->wq.entry, &fork_event);
1093 /*
1094 * fork_nctx can be freed as soon as
1095 * we drop the lock, unless we take a
1096 * reference on it.
1097 */
1098 userfaultfd_ctx_get(fork_nctx);
1099 spin_unlock(&ctx->event_wqh.lock);
1100 ret = 0;
1101 break;
1102 }
1103
1104 userfaultfd_event_complete(ctx, uwq);
1105 spin_unlock(&ctx->event_wqh.lock);
1106 ret = 0;
1107 break;
1108 }
1109 spin_unlock(&ctx->event_wqh.lock);
1110
1111 if (signal_pending(current)) {
1112 ret = -ERESTARTSYS;
1113 break;
1114 }
1115 if (no_wait) {
1116 ret = -EAGAIN;
1117 break;
1118 }
1119 spin_unlock_irq(&ctx->fd_wqh.lock);
1120 schedule();
1121 spin_lock_irq(&ctx->fd_wqh.lock);
1122 }
1123 __remove_wait_queue(&ctx->fd_wqh, &wait);
1124 __set_current_state(TASK_RUNNING);
1125 spin_unlock_irq(&ctx->fd_wqh.lock);
1126
1127 if (!ret && msg->event == UFFD_EVENT_FORK) {
1128 ret = resolve_userfault_fork(fork_nctx, inode, msg);
1129 spin_lock_irq(&ctx->event_wqh.lock);
1130 if (!list_empty(&fork_event)) {
1131 /*
1132 * The fork thread didn't abort, so we can
1133 * drop the temporary refcount.
1134 */
1135 userfaultfd_ctx_put(fork_nctx);
1136
1137 uwq = list_first_entry(&fork_event,
1138 typeof(*uwq),
1139 wq.entry);
1140 /*
1141 * If fork_event list wasn't empty and in turn
1142 * the event wasn't already released by fork
1143 * (the event is allocated on fork kernel
1144 * stack), put the event back to its place in
1145 * the event_wq. fork_event head will be freed
1146 * as soon as we return so the event cannot
1147 * stay queued there no matter the current
1148 * "ret" value.
1149 */
1150 list_del(&uwq->wq.entry);
1151 __add_wait_queue(&ctx->event_wqh, &uwq->wq);
1152
1153 /*
1154 * Leave the event in the waitqueue and report
1155 * error to userland if we failed to resolve
1156 * the userfault fork.
1157 */
1158 if (likely(!ret))
1159 userfaultfd_event_complete(ctx, uwq);
1160 } else {
1161 /*
1162 * Here the fork thread aborted and the
1163 * refcount from the fork thread on fork_nctx
1164 * has already been released. We still hold
1165 * the reference we took before releasing the
1166 * lock above. If resolve_userfault_fork
1167 * failed we've to drop it because the
1168 * fork_nctx has to be freed in such case. If
1169 * it succeeded we'll hold it because the new
1170 * uffd references it.
1171 */
1172 if (ret)
1173 userfaultfd_ctx_put(fork_nctx);
1174 }
1175 spin_unlock_irq(&ctx->event_wqh.lock);
1176 }
1177
1178 return ret;
1179 }
1180
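/*
 * A hedged userspace sketch of the reader side ("uffd" and "handle" are
 * placeholder names, not part of this file):
 *
 *	struct uffd_msg msg;
 *	if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
 *	    msg.event == UFFD_EVENT_PAGEFAULT)
 *		handle(msg.arg.pagefault.address);
 *
 * Each read() returns whole struct uffd_msg records, never partial ones.
 */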
1181 static ssize_t userfaultfd_read(struct file *file, char __user *buf,
1182 size_t count, loff_t *ppos)
1183 {
1184 struct userfaultfd_ctx *ctx = file->private_data;
1185 ssize_t _ret, ret = 0;
1186 struct uffd_msg msg;
1187 int no_wait = file->f_flags & O_NONBLOCK;
1188 struct inode *inode = file_inode(file);
1189
1190 if (!userfaultfd_is_initialized(ctx))
1191 return -EINVAL;
1192
1193 for (;;) {
1194 if (count < sizeof(msg))
1195 return ret ? ret : -EINVAL;
1196 _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
1197 if (_ret < 0)
1198 return ret ? ret : _ret;
1199 if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
1200 return ret ? ret : -EFAULT;
1201 ret += sizeof(msg);
1202 buf += sizeof(msg);
1203 count -= sizeof(msg);
1204 /*
1205 * Allow reading more than one fault at a time, but only
1206 * block if waiting for the very first one.
1207 */
1208 no_wait = O_NONBLOCK;
1209 }
1210 }
1211
1212 static void __wake_userfault(struct userfaultfd_ctx *ctx,
1213 struct userfaultfd_wake_range *range)
1214 {
1215 spin_lock_irq(&ctx->fault_pending_wqh.lock);
1216 /* wake all in the range and autoremove */
1217 if (waitqueue_active(&ctx->fault_pending_wqh))
1218 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
1219 range);
1220 if (waitqueue_active(&ctx->fault_wqh))
1221 __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
1222 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
1223 }
1224
1225 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1226 struct userfaultfd_wake_range *range)
1227 {
1228 unsigned seq;
1229 bool need_wakeup;
1230
1231 /*
1232 * To be sure waitqueue_active() is not reordered by the CPU
1233 * before the pagetable update, use an explicit SMP memory
1234 * barrier here. PT lock release or mmap_read_unlock(mm) still
1235 * have release semantics that can allow the
1236 * waitqueue_active() to be reordered before the pte update.
1237 */
1238 smp_mb();
1239
1240 /*
1241 * Use waitqueue_active because it's very frequent to
1242 * change the address space atomically even if there are no
1243 * userfaults yet. So we take the spinlock only when we're
1244 * sure we have userfaults to wake.
1245 */
1246 do {
1247 seq = read_seqcount_begin(&ctx->refile_seq);
1248 need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
1249 waitqueue_active(&ctx->fault_wqh);
1250 cond_resched();
1251 } while (read_seqcount_retry(&ctx->refile_seq, seq));
1252 if (need_wakeup)
1253 __wake_userfault(ctx, range);
1254 }
1255
1256 static __always_inline int validate_range(struct mm_struct *mm,
1257 __u64 start, __u64 len)
1258 {
1259 __u64 task_size = mm->task_size;
1260
1261 if (start & ~PAGE_MASK)
1262 return -EINVAL;
1263 if (len & ~PAGE_MASK)
1264 return -EINVAL;
1265 if (!len)
1266 return -EINVAL;
1267 if (start < mmap_min_addr)
1268 return -EINVAL;
1269 if (start >= task_size)
1270 return -EINVAL;
1271 if (len > task_size - start)
1272 return -EINVAL;
1273 return 0;
1274 }
1275
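/*
 * A hedged userspace sketch of the registration this handler serves
 * ("uffd", "base" and "length" are placeholder names):
 *
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)base, .len = length },
 *		.mode = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *	ioctl(uffd, UFFDIO_REGISTER, &reg);
 *
 * On success reg.ioctls reports which range ioctls are guaranteed to work.
 */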
1276 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1277 unsigned long arg)
1278 {
1279 struct mm_struct *mm = ctx->mm;
1280 struct vm_area_struct *vma, *prev, *cur;
1281 int ret;
1282 struct uffdio_register uffdio_register;
1283 struct uffdio_register __user *user_uffdio_register;
1284 unsigned long vm_flags, new_flags;
1285 bool found;
1286 bool basic_ioctls;
1287 unsigned long start, end, vma_end;
1288 MA_STATE(mas, &mm->mm_mt, 0, 0);
1289
1290 user_uffdio_register = (struct uffdio_register __user *) arg;
1291
1292 ret = -EFAULT;
1293 if (copy_from_user(&uffdio_register, user_uffdio_register,
1294 sizeof(uffdio_register)-sizeof(__u64)))
1295 goto out;
1296
1297 ret = -EINVAL;
1298 if (!uffdio_register.mode)
1299 goto out;
1300 if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
1301 goto out;
1302 vm_flags = 0;
1303 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1304 vm_flags |= VM_UFFD_MISSING;
1305 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1306 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
1307 goto out;
1308 #endif
1309 vm_flags |= VM_UFFD_WP;
1310 }
1311 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1312 #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1313 goto out;
1314 #endif
1315 vm_flags |= VM_UFFD_MINOR;
1316 }
1317
1318 ret = validate_range(mm, uffdio_register.range.start,
1319 uffdio_register.range.len);
1320 if (ret)
1321 goto out;
1322
1323 start = uffdio_register.range.start;
1324 end = start + uffdio_register.range.len;
1325
1326 ret = -ENOMEM;
1327 if (!mmget_not_zero(mm))
1328 goto out;
1329
1330 mmap_write_lock(mm);
1331 mas_set(&mas, start);
1332 vma = mas_find(&mas, ULONG_MAX);
1333 if (!vma)
1334 goto out_unlock;
1335
1336 /* check that there's at least one vma in the range */
1337 ret = -EINVAL;
1338 if (vma->vm_start >= end)
1339 goto out_unlock;
1340
1341 /*
1342 * If the first vma contains huge pages, make sure start address
1343 * is aligned to huge page size.
1344 */
1345 if (is_vm_hugetlb_page(vma)) {
1346 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1347
1348 if (start & (vma_hpagesize - 1))
1349 goto out_unlock;
1350 }
1351
1352 /*
1353 * Search for not compatible vmas.
1354 */
1355 found = false;
1356 basic_ioctls = false;
1357 for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
1358 cond_resched();
1359
1360 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1361 !!(cur->vm_flags & __VM_UFFD_FLAGS));
1362
1363 /* check not compatible vmas */
1364 ret = -EINVAL;
1365 if (!vma_can_userfault(cur, vm_flags))
1366 goto out_unlock;
1367
1368 /*
1369 * UFFDIO_COPY will fill file holes even without
1370 * PROT_WRITE. This check enforces that if this is a
1371 * MAP_SHARED, the process has write permission to the backing
1372 * file. If VM_MAYWRITE is set it also enforces that on a
1373 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
1374 * F_WRITE_SEAL can be taken until the vma is destroyed.
1375 */
1376 ret = -EPERM;
1377 if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1378 goto out_unlock;
1379
1380 /*
1381 * If this vma contains ending address, and huge pages
1382 * check alignment.
1383 */
1384 if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
1385 end > cur->vm_start) {
1386 unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
1387
1388 ret = -EINVAL;
1389
1390 if (end & (vma_hpagesize - 1))
1391 goto out_unlock;
1392 }
1393 if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1394 goto out_unlock;
1395
1396 /*
1397 * Check that this vma isn't already owned by a
1398 * different userfaultfd. We can't allow more than one
1399 * userfaultfd to own a single vma simultaneously or we
1400 * wouldn't know which one to deliver the userfaults to.
1401 */
1402 ret = -EBUSY;
1403 if (cur->vm_userfaultfd_ctx.ctx &&
1404 cur->vm_userfaultfd_ctx.ctx != ctx)
1405 goto out_unlock;
1406
1407 /*
1408 * Note vmas containing huge pages
1409 */
1410 if (is_vm_hugetlb_page(cur))
1411 basic_ioctls = true;
1412
1413 found = true;
1414 }
1415 BUG_ON(!found);
1416
1417 mas_set(&mas, start);
1418 prev = mas_prev(&mas, 0);
1419 if (prev != vma)
1420 mas_next(&mas, ULONG_MAX);
1421
1422 ret = 0;
1423 do {
1424 cond_resched();
1425
1426 BUG_ON(!vma_can_userfault(vma, vm_flags));
1427 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
1428 vma->vm_userfaultfd_ctx.ctx != ctx);
1429 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1430
1431 /*
1432 * Nothing to do: this vma is already registered into this
1433 * userfaultfd and with the right tracking mode too.
1434 */
1435 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1436 (vma->vm_flags & vm_flags) == vm_flags)
1437 goto skip;
1438
1439 if (vma->vm_start > start)
1440 start = vma->vm_start;
1441 vma_end = min(end, vma->vm_end);
1442
1443 new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
1444 prev = vma_merge(mm, prev, start, vma_end, new_flags,
1445 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
1446 vma_policy(vma),
1447 ((struct vm_userfaultfd_ctx){ ctx }),
1448 anon_vma_name(vma));
1449 if (prev) {
1450 /* vma_merge() invalidated the mas */
1451 mas_pause(&mas);
1452 vma = prev;
1453 goto next;
1454 }
1455 if (vma->vm_start < start) {
1456 ret = split_vma(mm, vma, start, 1);
1457 if (ret)
1458 break;
1459 /* split_vma() invalidated the mas */
1460 mas_pause(&mas);
1461 }
1462 if (vma->vm_end > end) {
1463 ret = split_vma(mm, vma, end, 0);
1464 if (ret)
1465 break;
1466 /* split_vma() invalidated the mas */
1467 mas_pause(&mas);
1468 }
1469 next:
1470 /*
1471 * In the vma_merge() successful mprotect-like case 8:
1472 * the next vma was merged into the current one and
1473 * the current one has not been updated yet.
1474 */
1475 vma_start_write(vma);
1476 userfaultfd_set_vm_flags(vma, new_flags);
1477 vma->vm_userfaultfd_ctx.ctx = ctx;
1478
1479 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
1480 hugetlb_unshare_all_pmds(vma);
1481
1482 skip:
1483 prev = vma;
1484 start = vma->vm_end;
1485 vma = mas_next(&mas, end - 1);
1486 } while (vma);
1487 out_unlock:
1488 mmap_write_unlock(mm);
1489 mmput(mm);
1490 if (!ret) {
1491 __u64 ioctls_out;
1492
1493 ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1494 UFFD_API_RANGE_IOCTLS;
1495
1496 /*
1497 * Declare the WP ioctl only if the WP mode is
1498 * specified and all checks passed with the range
1499 */
1500 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1501 ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
1502
1503 /* CONTINUE ioctl is only supported for MINOR ranges. */
1504 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1505 ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1506
1507 /*
1508 * Now that we scanned all vmas we can already tell
1509 * userland which ioctls methods are guaranteed to
1510 * succeed on this range.
1511 */
1512 if (put_user(ioctls_out, &user_uffdio_register->ioctls))
1513 ret = -EFAULT;
1514 }
1515 out:
1516 return ret;
1517 }
1518
1519 static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1520 unsigned long arg)
1521 {
1522 struct mm_struct *mm = ctx->mm;
1523 struct vm_area_struct *vma, *prev, *cur;
1524 int ret;
1525 struct uffdio_range uffdio_unregister;
1526 unsigned long new_flags;
1527 bool found;
1528 unsigned long start, end, vma_end;
1529 const void __user *buf = (void __user *)arg;
1530 MA_STATE(mas, &mm->mm_mt, 0, 0);
1531
1532 ret = -EFAULT;
1533 if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
1534 goto out;
1535
1536 ret = validate_range(mm, uffdio_unregister.start,
1537 uffdio_unregister.len);
1538 if (ret)
1539 goto out;
1540
1541 start = uffdio_unregister.start;
1542 end = start + uffdio_unregister.len;
1543
1544 ret = -ENOMEM;
1545 if (!mmget_not_zero(mm))
1546 goto out;
1547
1548 mmap_write_lock(mm);
1549 mas_set(&mas, start);
1550 vma = mas_find(&mas, ULONG_MAX);
1551 if (!vma)
1552 goto out_unlock;
1553
1554 /* check that there's at least one vma in the range */
1555 ret = -EINVAL;
1556 if (vma->vm_start >= end)
1557 goto out_unlock;
1558
1559 /*
1560 * If the first vma contains huge pages, make sure start address
1561 * is aligned to huge page size.
1562 */
1563 if (is_vm_hugetlb_page(vma)) {
1564 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1565
1566 if (start & (vma_hpagesize - 1))
1567 goto out_unlock;
1568 }
1569
1570 /*
1571 * Search for not compatible vmas.
1572 */
1573 found = false;
1574 ret = -EINVAL;
1575 for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
1576 cond_resched();
1577
1578 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
1579 !!(cur->vm_flags & __VM_UFFD_FLAGS));
1580
1581 /*
1582 * Check for incompatible vmas, not strictly required
1583 * here as incompatible vmas cannot have a
1584 * userfaultfd_ctx registered on them, but this
1585 * provides for more strict behavior to notice
1586 * unregistration errors.
1587 */
1588 if (!vma_can_userfault(cur, cur->vm_flags))
1589 goto out_unlock;
1590
1591 found = true;
1592 }
1593 BUG_ON(!found);
1594
1595 mas_set(&mas, start);
1596 prev = mas_prev(&mas, 0);
1597 if (prev != vma)
1598 mas_next(&mas, ULONG_MAX);
1599
1600 ret = 0;
1601 do {
1602 cond_resched();
1603
1604 BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
1605
1606 /*
1607 * Nothing to do: this vma is not registered with any
1608 * userfaultfd, so there is nothing to unregister.
1609 */
1610 if (!vma->vm_userfaultfd_ctx.ctx)
1611 goto skip;
1612
1613 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1614
1615 if (vma->vm_start > start)
1616 start = vma->vm_start;
1617 vma_end = min(end, vma->vm_end);
1618
1619 if (userfaultfd_missing(vma)) {
1620 /*
1621 * Wake any concurrent pending userfault while
1622 * we unregister, so they will not hang
1623 * permanently, and userland does not have to call
1624 * UFFDIO_WAKE explicitly.
1625 */
1626 struct userfaultfd_wake_range range;
1627 range.start = start;
1628 range.len = vma_end - start;
1629 wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
1630 }
1631
1632 /* Reset ptes for the whole vma range if wr-protected */
1633 if (userfaultfd_wp(vma))
1634 uffd_wp_range(mm, vma, start, vma_end - start, false);
1635
1636 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
1637 prev = vma_merge(mm, prev, start, vma_end, new_flags,
1638 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
1639 vma_policy(vma),
1640 NULL_VM_UFFD_CTX, anon_vma_name(vma));
1641 if (prev) {
1642 vma = prev;
1643 mas_pause(&mas);
1644 goto next;
1645 }
1646 if (vma->vm_start < start) {
1647 ret = split_vma(mm, vma, start, 1);
1648 if (ret)
1649 break;
1650 mas_pause(&mas);
1651 }
1652 if (vma->vm_end > end) {
1653 ret = split_vma(mm, vma, end, 0);
1654 if (ret)
1655 break;
1656 mas_pause(&mas);
1657 }
1658 next:
1659 /*
1660 * In the vma_merge() successful mprotect-like case 8:
1661 * the next vma was merged into the current one and
1662 * the current one has not been updated yet.
1663 */
1664 vma_start_write(vma);
1665 userfaultfd_set_vm_flags(vma, new_flags);
1666 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
1667
1668 skip:
1669 prev = vma;
1670 start = vma->vm_end;
1671 vma = mas_next(&mas, end - 1);
1672 } while (vma);
1673 out_unlock:
1674 mmap_write_unlock(mm);
1675 mmput(mm);
1676 out:
1677 return ret;
1678 }
1679
1680 /*
1681 * userfaultfd_wake may be used in combination with the
1682 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
1683 */
1684 static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1685 unsigned long arg)
1686 {
1687 int ret;
1688 struct uffdio_range uffdio_wake;
1689 struct userfaultfd_wake_range range;
1690 const void __user *buf = (void __user *)arg;
1691
1692 ret = -EFAULT;
1693 if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1694 goto out;
1695
1696 ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1697 if (ret)
1698 goto out;
1699
1700 range.start = uffdio_wake.start;
1701 range.len = uffdio_wake.len;
1702
1703 /*
1704 * len == 0 means wake all and we don't want to wake all here,
1705 * so check it again to be sure.
1706 */
1707 VM_BUG_ON(!range.len);
1708
1709 wake_userfault(ctx, &range);
1710 ret = 0;
1711
1712 out:
1713 return ret;
1714 }
1715
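/*
 * A hedged userspace sketch of how this handler is normally reached
 * ("uffd", "fault_addr", "page" and "page_size" are placeholder names):
 *
 *	struct uffdio_copy copy = {
 *		.dst = fault_addr & ~(page_size - 1),
 *		.src = (unsigned long)page,
 *		.len = page_size,
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_COPY, &copy);
 *
 * The kernel writes the number of bytes copied (or a negative error)
 * back into copy.copy.
 */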
1716 static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1717 unsigned long arg)
1718 {
1719 __s64 ret;
1720 struct uffdio_copy uffdio_copy;
1721 struct uffdio_copy __user *user_uffdio_copy;
1722 struct userfaultfd_wake_range range;
1723
1724 user_uffdio_copy = (struct uffdio_copy __user *) arg;
1725
1726 ret = -EAGAIN;
1727 if (atomic_read(&ctx->mmap_changing))
1728 goto out;
1729
1730 ret = -EFAULT;
1731 if (copy_from_user(&uffdio_copy, user_uffdio_copy,
1732 /* don't copy "copy" last field */
1733 sizeof(uffdio_copy)-sizeof(__s64)))
1734 goto out;
1735
1736 ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1737 if (ret)
1738 goto out;
1739 /*
1740 * double check for wraparound just in case. copy_from_user()
1741 * will later check that uffdio_copy.src + uffdio_copy.len fits
1742 * in the userland range.
1743 */
1744 ret = -EINVAL;
1745 if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
1746 goto out;
1747 if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
1748 goto out;
1749 if (mmget_not_zero(ctx->mm)) {
1750 ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
1751 uffdio_copy.len, &ctx->mmap_changing,
1752 uffdio_copy.mode);
1753 mmput(ctx->mm);
1754 } else {
1755 return -ESRCH;
1756 }
1757 if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1758 return -EFAULT;
1759 if (ret < 0)
1760 goto out;
1761 BUG_ON(!ret);
1762 /* len == 0 would wake all */
1763 range.len = ret;
1764 if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1765 range.start = uffdio_copy.dst;
1766 wake_userfault(ctx, &range);
1767 }
1768 ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1769 out:
1770 return ret;
1771 }
1772
1773 static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1774 unsigned long arg)
1775 {
1776 __s64 ret;
1777 struct uffdio_zeropage uffdio_zeropage;
1778 struct uffdio_zeropage __user *user_uffdio_zeropage;
1779 struct userfaultfd_wake_range range;
1780
1781 user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1782
1783 ret = -EAGAIN;
1784 if (atomic_read(&ctx->mmap_changing))
1785 goto out;
1786
1787 ret = -EFAULT;
1788 if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
1789 /* don't copy "zeropage" last field */
1790 sizeof(uffdio_zeropage)-sizeof(__s64)))
1791 goto out;
1792
1793 ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1794 uffdio_zeropage.range.len);
1795 if (ret)
1796 goto out;
1797 ret = -EINVAL;
1798 if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1799 goto out;
1800
1801 if (mmget_not_zero(ctx->mm)) {
1802 ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
1803 uffdio_zeropage.range.len,
1804 &ctx->mmap_changing);
1805 mmput(ctx->mm);
1806 } else {
1807 return -ESRCH;
1808 }
1809 if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1810 return -EFAULT;
1811 if (ret < 0)
1812 goto out;
1813 /* len == 0 would wake all */
1814 BUG_ON(!ret);
1815 range.len = ret;
1816 if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1817 range.start = uffdio_zeropage.range.start;
1818 wake_userfault(ctx, &range);
1819 }
1820 ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1821 out:
1822 return ret;
1823 }
1824
1825 static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1826 unsigned long arg)
1827 {
1828 int ret;
1829 struct uffdio_writeprotect uffdio_wp;
1830 struct uffdio_writeprotect __user *user_uffdio_wp;
1831 struct userfaultfd_wake_range range;
1832 bool mode_wp, mode_dontwake;
1833
1834 if (atomic_read(&ctx->mmap_changing))
1835 return -EAGAIN;
1836
1837 user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1838
1839 if (copy_from_user(&uffdio_wp, user_uffdio_wp,
1840 sizeof(struct uffdio_writeprotect)))
1841 return -EFAULT;
1842
1843 ret = validate_range(ctx->mm, uffdio_wp.range.start,
1844 uffdio_wp.range.len);
1845 if (ret)
1846 return ret;
1847
1848 if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1849 UFFDIO_WRITEPROTECT_MODE_WP))
1850 return -EINVAL;
1851
1852 mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1853 mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1854
1855 if (mode_wp && mode_dontwake)
1856 return -EINVAL;
1857
1858 if (mmget_not_zero(ctx->mm)) {
1859 ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
1860 uffdio_wp.range.len, mode_wp,
1861 &ctx->mmap_changing);
1862 mmput(ctx->mm);
1863 } else {
1864 return -ESRCH;
1865 }
1866
1867 if (ret)
1868 return ret;
1869
1870 if (!mode_wp && !mode_dontwake) {
1871 range.start = uffdio_wp.range.start;
1872 range.len = uffdio_wp.range.len;
1873 wake_userfault(ctx, &range);
1874 }
1875 return ret;
1876 }
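
/*
 * Illustrative userspace sketch (not part of this file): toggling
 * write-protection on a range registered with UFFDIO_REGISTER_MODE_WP.
 * The helper name set_wp() and its parameters are assumptions for the
 * example only.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *	#include <stdbool.h>
 *	#include <string.h>
 *
 *	static int set_wp(int uffd, unsigned long start, unsigned long len,
 *			  bool protect)
 *	{
 *		struct uffdio_writeprotect wp;
 *
 *		memset(&wp, 0, sizeof(wp));
 *		wp.range.start = start;
 *		wp.range.len = len;
 *		// Setting WP arms write-protection; clearing it (mode 0)
 *		// resolves a UFFD_PAGEFAULT_FLAG_WP fault and, since
 *		// DONTWAKE is not set, wakes the blocked writer.
 *		wp.mode = protect ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
 *		return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 *	}
 */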

static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
	__s64 ret;
	struct uffdio_continue uffdio_continue;
	struct uffdio_continue __user *user_uffdio_continue;
	struct userfaultfd_wake_range range;

	user_uffdio_continue = (struct uffdio_continue __user *)arg;

	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
			   /* don't copy the output fields */
			   sizeof(uffdio_continue) - (sizeof(__s64))))
		goto out;

	ret = validate_range(ctx->mm, uffdio_continue.range.start,
			     uffdio_continue.range.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	/* double check for wraparound just in case. */
	if (uffdio_continue.range.start + uffdio_continue.range.len <=
	    uffdio_continue.range.start) {
		goto out;
	}
	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
				     uffdio_continue.range.len,
				     &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}

	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
		range.start = uffdio_continue.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;

out:
	return ret;
}
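
/*
 * Illustrative userspace sketch (not part of this file): resolving a
 * minor fault with UFFDIO_CONTINUE once the page cache already holds the
 * right contents (e.g. populated through a second mapping of the same
 * shmem/hugetlbfs file). The helper name resolve_minor() and its
 * parameters are assumptions for the example only.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *
 *	static int resolve_minor(int uffd, unsigned long fault_addr,
 *				 unsigned long page_size)
 *	{
 *		struct uffdio_continue cont;
 *
 *		memset(&cont, 0, sizeof(cont));
 *		cont.range.start = fault_addr & ~(page_size - 1);
 *		cont.range.len = page_size;
 *		cont.mode = 0;		// or UFFDIO_CONTINUE_MODE_DONTWAKE
 *		if (ioctl(uffd, UFFDIO_CONTINUE, &cont) == -1)
 *			return -1;	// cont.mapped holds -errno or bytes mapped
 *		return 0;
 *	}
 */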

static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide. Set
	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
	 */
	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}

/*
 * Userland asks for a certain API version and we return which feature
 * bits and ioctl commands are implemented by this kernel for that API
 * version, or -EINVAL if the version is unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	unsigned int ctx_features;
	int ret;
	__u64 features;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
		goto out;
	features = uffdio_api.features;
	ret = -EINVAL;
	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
		goto err_out;
	ret = -EPERM;
	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
		goto err_out;
	/* report all available features and ioctls to userland */
	uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
	uffdio_api.features &=
		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
#endif
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;

	/* only enable the requested features for this uffd context */
	ctx_features = uffd_ctx_features(features);
	ret = -EINVAL;
	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
		goto err_out;

	ret = 0;
out:
	return ret;
err_out:
	memset(&uffdio_api, 0, sizeof(uffdio_api));
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		ret = -EFAULT;
	goto out;
}
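
/*
 * Illustrative userspace sketch (not part of this file): the UFFDIO_API
 * handshake that must precede any other ioctl on the fd, here requesting
 * no optional features. enable_api() is a name made up for the example.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *	#include <string.h>
 *
 *	static int enable_api(int uffd)
 *	{
 *		struct uffdio_api api;
 *
 *		memset(&api, 0, sizeof(api));
 *		api.api = UFFD_API;
 *		api.features = 0;	// only request features you will use
 *		if (ioctl(uffd, UFFDIO_API, &api) == -1)
 *			return -1;
 *		// On success api.features/api.ioctls report what the kernel
 *		// supports; the context is now marked initialized and the
 *		// handshake cannot be repeated on this fd.
 *		return 0;
 *	}
 */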

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
		return -EINVAL;

	switch(cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	case UFFDIO_COPY:
		ret = userfaultfd_copy(ctx, arg);
		break;
	case UFFDIO_ZEROPAGE:
		ret = userfaultfd_zeropage(ctx, arg);
		break;
	case UFFDIO_WRITEPROTECT:
		ret = userfaultfd_writeprotect(ctx, arg);
		break;
	case UFFDIO_CONTINUE:
		ret = userfaultfd_continue(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols are added in the future, they will all be
	 * shown separated by a space, like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif
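
/*
 * Illustrative userspace sketch (not part of this file): the counters
 * above can be inspected by reading /proc/<pid>/fdinfo/<fd> for the
 * userfaultfd descriptor; "pending" counts faults not yet read, while
 * "total" also includes faults already read but not yet resolved.
 *
 *	#include <stdio.h>
 *
 *	static void dump_uffd_fdinfo(int uffd)
 *	{
 *		char path[64], line[256];
 *		FILE *f;
 *
 *		snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", uffd);
 *		f = fopen(path, "r");
 *		if (!f)
 *			return;
 *		while (fgets(line, sizeof(line), f))
 *			fputs(line, stdout);	// includes pending:, total:, API: lines
 *		fclose(f);
 *	}
 */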

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo = userfaultfd_show_fdinfo,
#endif
	.release = userfaultfd_release,
	.poll = userfaultfd_poll,
	.read = userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.llseek = noop_llseek,
};
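
/*
 * Illustrative userspace sketch (not part of this file): a minimal
 * monitor loop driving the poll/read/ioctl interfaces wired up above.
 * monitor_loop() is a name made up for the example; real handlers would
 * resolve each fault with one of the sketches shown earlier.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <poll.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	static void monitor_loop(int uffd)
 *	{
 *		struct pollfd pfd = { .fd = uffd, .events = POLLIN };
 *		struct uffd_msg msg;
 *
 *		for (;;) {
 *			if (poll(&pfd, 1, -1) <= 0)
 *				continue;
 *			// Each successful read dequeues exactly one message.
 *			if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
 *				continue;
 *			if (msg.event != UFFD_EVENT_PAGEFAULT)
 *				continue;
 *			// Resolve here, e.g. UFFDIO_COPY for a missing fault,
 *			// or clear write-protection when the flags contain
 *			// UFFD_PAGEFAULT_FLAG_WP.
 *			printf("fault at %llx, flags %llx\n",
 *			       (unsigned long long)msg.arg.pagefault.address,
 *			       (unsigned long long)msg.arg.pagefault.flags);
 *		}
 *	}
 */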

static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->event_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}

static int new_userfaultfd(int flags)
{
	struct userfaultfd_ctx *ctx;
	int fd;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency. */
	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
		return -EINVAL;

	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	refcount_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->released = false;
	atomic_set(&ctx->mmap_changing, 0);
	ctx->mm = current->mm;
	/* prevent the mm struct from being freed */
	mmgrab(ctx->mm);

	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
	if (fd < 0) {
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
	return fd;
}

static inline bool userfaultfd_syscall_allowed(int flags)
{
	/* Userspace-only page faults are always allowed */
	if (flags & UFFD_USER_MODE_ONLY)
		return true;

	/*
	 * The user is requesting a userfaultfd which can handle kernel faults.
	 * Privileged users are always allowed to do this.
	 */
	if (capable(CAP_SYS_PTRACE))
		return true;

	/* Otherwise, access to kernel fault handling is sysctl controlled. */
	return sysctl_unprivileged_userfaultfd;
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	if (!userfaultfd_syscall_allowed(flags))
		return -EPERM;

	return new_userfaultfd(flags);
}
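
/*
 * Illustrative userspace sketch (not part of this file): creating a
 * userfaultfd through the syscall. Unprivileged callers may need
 * UFFD_USER_MODE_ONLY unless vm.unprivileged_userfaultfd is enabled or
 * they hold CAP_SYS_PTRACE, as checked above. open_uffd() is a name made
 * up for the example.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/syscall.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int open_uffd(void)
 *	{
 *		// O_CLOEXEC/O_NONBLOCK map onto UFFD_CLOEXEC/UFFD_NONBLOCK.
 *		return syscall(__NR_userfaultfd,
 *			       O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 *	}
 */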

static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
{
	if (cmd != USERFAULTFD_IOC_NEW)
		return -EINVAL;

	return new_userfaultfd(flags);
}

static const struct file_operations userfaultfd_dev_fops = {
	.unlocked_ioctl = userfaultfd_dev_ioctl,
	.compat_ioctl = userfaultfd_dev_ioctl,
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};

static struct miscdevice userfaultfd_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "userfaultfd",
	.fops = &userfaultfd_dev_fops
};
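
/*
 * Illustrative userspace sketch (not part of this file): creating a
 * userfaultfd through the /dev/userfaultfd miscdevice instead of the
 * syscall, so that access can be managed with ordinary file permissions
 * on the device node. open_uffd_dev() is a name made up for the example.
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int open_uffd_dev(void)
 *	{
 *		int dev, uffd;
 *
 *		dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
 *		if (dev < 0)
 *			return -1;
 *		// The ioctl argument carries the same flags as the syscall.
 *		uffd = ioctl(dev, USERFAULTFD_IOC_NEW,
 *			     O_CLOEXEC | O_NONBLOCK);
 *		close(dev);
 *		return uffd;
 *	}
 */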

static int __init userfaultfd_init(void)
{
	int ret;

	ret = misc_register(&userfaultfd_misc);
	if (ret)
		return ret;

	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
						sizeof(struct userfaultfd_ctx),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						init_once_userfaultfd_ctx);
	return 0;
}
__initcall(userfaultfd_init);