1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * mm/mremap.c
4 *
5 * (C) Copyright 1996 Linus Torvalds
6 *
7 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
8 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
9 */
10
11 #include <linux/mm.h>
12 #include <linux/mm_inline.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/ksm.h>
16 #include <linux/mman.h>
17 #include <linux/swap.h>
18 #include <linux/capability.h>
19 #include <linux/fs.h>
20 #include <linux/swapops.h>
21 #include <linux/highmem.h>
22 #include <linux/security.h>
23 #include <linux/syscalls.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/uaccess.h>
26 #include <linux/userfaultfd_k.h>
27 #include <linux/mempolicy.h>
28 #include <linux/page_size_compat.h>
29
30 #include <asm/cacheflush.h>
31 #include <asm/tlb.h>
32 #include <asm/pgalloc.h>
33
34 #include "internal.h"
35
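/*
 * Walk the existing page tables of @mm for @addr without allocating
 * anything; returns NULL as soon as a level is missing or bad.
 */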
36 static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
37 {
38 pgd_t *pgd;
39 p4d_t *p4d;
40 pud_t *pud;
41
42 pgd = pgd_offset(mm, addr);
43 if (pgd_none_or_clear_bad(pgd))
44 return NULL;
45
46 p4d = p4d_offset(pgd, addr);
47 if (p4d_none_or_clear_bad(p4d))
48 return NULL;
49
50 pud = pud_offset(p4d, addr);
51 if (pud_none_or_clear_bad(pud))
52 return NULL;
53
54 return pud;
55 }
56
57 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
58 {
59 pud_t *pud;
60 pmd_t *pmd;
61
62 pud = get_old_pud(mm, addr);
63 if (!pud)
64 return NULL;
65
66 pmd = pmd_offset(pud, addr);
67 if (pmd_none(*pmd))
68 return NULL;
69
70 return pmd;
71 }
72
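/*
 * Unlike get_old_pud(), allocate any missing intermediate levels for the
 * destination address; returns NULL only on allocation failure.
 */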
73 static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
74 unsigned long addr)
75 {
76 pgd_t *pgd;
77 p4d_t *p4d;
78
79 pgd = pgd_offset(mm, addr);
80 p4d = p4d_alloc(mm, pgd, addr);
81 if (!p4d)
82 return NULL;
83
84 return pud_alloc(mm, p4d, addr);
85 }
86
87 static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
88 unsigned long addr)
89 {
90 pud_t *pud;
91 pmd_t *pmd;
92
93 pud = alloc_new_pud(mm, vma, addr);
94 if (!pud)
95 return NULL;
96
97 pmd = pmd_alloc(mm, pud, addr);
98 if (!pmd)
99 return NULL;
100
101 VM_BUG_ON(pmd_trans_huge(*pmd));
102
103 return pmd;
104 }
105
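/*
 * The rmap locks are taken in i_mmap_rwsem then anon_vma order and dropped
 * in the reverse order; see the comment in move_ptes() for when callers
 * need them.
 */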
106 static void take_rmap_locks(struct vm_area_struct *vma)
107 {
108 if (vma->vm_file)
109 i_mmap_lock_write(vma->vm_file->f_mapping);
110 if (vma->anon_vma)
111 anon_vma_lock_write(vma->anon_vma);
112 }
113
114 static void drop_rmap_locks(struct vm_area_struct *vma)
115 {
116 if (vma->anon_vma)
117 anon_vma_unlock_write(vma->anon_vma);
118 if (vma->vm_file)
119 i_mmap_unlock_write(vma->vm_file->f_mapping);
120 }
121
122 static pte_t move_soft_dirty_pte(pte_t pte)
123 {
124 /*
125 * Set soft dirty bit so we can notice
126 * in userspace the ptes were moved.
127 */
128 #ifdef CONFIG_MEM_SOFT_DIRTY
129 if (pte_present(pte))
130 pte = pte_mksoft_dirty(pte);
131 else if (is_swap_pte(pte))
132 pte = pte_swp_mksoft_dirty(pte);
133 #endif
134 return pte;
135 }
136
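/*
 * Move the ptes covering [old_addr, old_end) from old_pmd to new_pmd.
 * Returns 0 on success or -EAGAIN if either pte page could not be mapped,
 * in which case move_page_tables() retries the pmd.
 */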
137 static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
138 unsigned long old_addr, unsigned long old_end,
139 struct vm_area_struct *new_vma, pmd_t *new_pmd,
140 unsigned long new_addr, bool need_rmap_locks)
141 {
142 bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
143 struct mm_struct *mm = vma->vm_mm;
144 pte_t *old_pte, *new_pte, pte;
145 pmd_t dummy_pmdval;
146 spinlock_t *old_ptl, *new_ptl;
147 bool force_flush = false;
148 unsigned long len = old_end - old_addr;
149 int err = 0;
150
151 /*
152 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
153 * locks to ensure that rmap will always observe either the old or the
154 * new ptes. This is the easiest way to avoid races with
155 * truncate_pagecache(), page migration, etc...
156 *
157 * When need_rmap_locks is false, we use other ways to avoid
158 * such races:
159 *
160 * - During exec() shift_arg_pages(), we use a specially tagged vma
161 * which rmap call sites look for using vma_is_temporary_stack().
162 *
163 * - During mremap(), new_vma is often known to be placed after vma
164 * in rmap traversal order. This ensures rmap will always observe
165 * either the old pte, or the new pte, or both (the page table locks
166 * serialize access to individual ptes, but only rmap traversal
167 * order guarantees that we won't miss both the old and new ptes).
168 */
169 if (need_rmap_locks)
170 take_rmap_locks(vma);
171
172 /*
173 * We don't have to worry about the ordering of src and dst
174 * pte locks because exclusive mmap_lock prevents deadlock.
175 */
176 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
177 if (!old_pte) {
178 err = -EAGAIN;
179 goto out;
180 }
181 /*
182 * Now new_pte is none, so the hpage_collapse_scan_file() path cannot find
183 * this by traversing file->f_mapping, so there is no concurrency with
184 * retract_page_tables(). In addition, we already hold the exclusive
185 * mmap_lock, so this new_pte page is stable and there is no need to get
186 * pmdval and do a pmd_same() check.
187 */
188 new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
189 &new_ptl);
190 if (!new_pte) {
191 pte_unmap_unlock(old_pte, old_ptl);
192 err = -EAGAIN;
193 goto out;
194 }
195 if (new_ptl != old_ptl)
196 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
197 flush_tlb_batched_pending(vma->vm_mm);
198 arch_enter_lazy_mmu_mode();
199
200 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
201 new_pte++, new_addr += PAGE_SIZE) {
202 if (pte_none(ptep_get(old_pte)))
203 continue;
204
205 pte = ptep_get_and_clear(mm, old_addr, old_pte);
206 /*
207 * If we are remapping a valid PTE, make sure
208 * to flush TLB before we drop the PTL for the
209 * PTE.
210 *
211 * NOTE! Both old and new PTL matter: the old one
212 * for racing with folio_mkclean(), the new one to
213 * make sure the physical page stays valid until
214 * the TLB entry for the old mapping has been
215 * flushed.
216 */
217 if (pte_present(pte))
218 force_flush = true;
219 pte = move_pte(pte, old_addr, new_addr);
220 pte = move_soft_dirty_pte(pte);
221
222 if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
223 pte_clear(mm, new_addr, new_pte);
224 else {
225 if (need_clear_uffd_wp) {
226 if (pte_present(pte))
227 pte = pte_clear_uffd_wp(pte);
228 else if (is_swap_pte(pte))
229 pte = pte_swp_clear_uffd_wp(pte);
230 }
231 set_pte_at(mm, new_addr, new_pte, pte);
232 }
233 }
234
235 arch_leave_lazy_mmu_mode();
236 if (force_flush)
237 flush_tlb_range(vma, old_end - len, old_end);
238 if (new_ptl != old_ptl)
239 spin_unlock(new_ptl);
240 pte_unmap(new_pte - 1);
241 pte_unmap_unlock(old_pte - 1, old_ptl);
242 out:
243 if (need_rmap_locks)
244 drop_rmap_locks(vma);
245 return err;
246 }
247
248 #ifndef arch_supports_page_table_move
249 #define arch_supports_page_table_move arch_supports_page_table_move
250 static inline bool arch_supports_page_table_move(void)
251 {
252 return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
253 IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
254 }
255 #endif
256
257 #ifdef CONFIG_HAVE_MOVE_PMD
258 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
259 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
260 {
261 spinlock_t *old_ptl, *new_ptl;
262 struct mm_struct *mm = vma->vm_mm;
263 bool res = false;
264 pmd_t pmd;
265
266 if (!arch_supports_page_table_move())
267 return false;
268 /*
269 * The destination pmd shouldn't be established, free_pgtables()
270 * should have released it.
271 *
272 * However, there's a case during execve() where we use mremap
273 * to move the initial stack, and in that case the target area
274 * may overlap the source area (always moving down).
275 *
276 * If everything is PMD-aligned, that works fine, as moving
277 * each pmd down will clear the source pmd. But if we first
278 * have a few 4kB-only pages that get moved down, and then
279 * hit the "now the rest is PMD-aligned, let's do everything
280 * one pmd at a time", we will still have the old (now empty
281 * of any 4kB pages, but still there) PMD in the page table
282 * tree.
283 *
284 * Warn on it once - because we really should try to figure
285 * out how to do this better - but then say "I won't move
286 * this pmd".
287 *
288 * One alternative might be to just unmap the target pmd at
289 * this point, and verify that it really is empty. We'll see.
290 */
291 if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
292 return false;
293
294 /* If this pmd belongs to a uffd vma with remap events disabled, we need
295 * to ensure that the uffd-wp state is cleared from all pgtables. This
296 * means recursing into lower page tables in move_page_tables(), and we
297 * can reuse the existing code if we simply treat the entry as "not
298 * moved".
299 */
300 if (vma_has_uffd_without_event_remap(vma))
301 return false;
302
303 /*
304 * We don't have to worry about the ordering of src and dst
305 * ptlocks because exclusive mmap_lock prevents deadlock.
306 */
307 old_ptl = pmd_lock(vma->vm_mm, old_pmd);
308 new_ptl = pmd_lockptr(mm, new_pmd);
309 if (new_ptl != old_ptl)
310 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
311
312 pmd = *old_pmd;
313
314 /* Racing with collapse? */
315 if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
316 goto out_unlock;
317 /* Clear the pmd */
318 pmd_clear(old_pmd);
319 res = true;
320
321 VM_BUG_ON(!pmd_none(*new_pmd));
322
323 pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
324 flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
325 out_unlock:
326 if (new_ptl != old_ptl)
327 spin_unlock(new_ptl);
328 spin_unlock(old_ptl);
329
330 return res;
331 }
332 #else
333 static inline bool move_normal_pmd(struct vm_area_struct *vma,
334 unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
335 pmd_t *new_pmd)
336 {
337 return false;
338 }
339 #endif
340
341 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
342 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
343 unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
344 {
345 spinlock_t *old_ptl, *new_ptl;
346 struct mm_struct *mm = vma->vm_mm;
347 pud_t pud;
348
349 if (!arch_supports_page_table_move())
350 return false;
351 /*
352 * The destination pud shouldn't be established, free_pgtables()
353 * should have released it.
354 */
355 if (WARN_ON_ONCE(!pud_none(*new_pud)))
356 return false;
357
358 /* If this pud belongs to a uffd vma with remap events disabled, we need
359 * to ensure that the uffd-wp state is cleared from all pgtables. This
360 * means recursing into lower page tables in move_page_tables(), and we
361 * can reuse the existing code if we simply treat the entry as "not
362 * moved".
363 */
364 if (vma_has_uffd_without_event_remap(vma))
365 return false;
366
367 /*
368 * We don't have to worry about the ordering of src and dst
369 * ptlocks because exclusive mmap_lock prevents deadlock.
370 */
371 old_ptl = pud_lock(vma->vm_mm, old_pud);
372 new_ptl = pud_lockptr(mm, new_pud);
373 if (new_ptl != old_ptl)
374 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
375
376 /* Clear the pud */
377 pud = *old_pud;
378 pud_clear(old_pud);
379
380 VM_BUG_ON(!pud_none(*new_pud));
381
382 pud_populate(mm, new_pud, pud_pgtable(pud));
383 flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
384 if (new_ptl != old_ptl)
385 spin_unlock(new_ptl);
386 spin_unlock(old_ptl);
387
388 return true;
389 }
390 #else
391 static inline bool move_normal_pud(struct vm_area_struct *vma,
392 unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
393 pud_t *new_pud)
394 {
395 return false;
396 }
397 #endif
398
399 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
400 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
401 unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
402 {
403 spinlock_t *old_ptl, *new_ptl;
404 struct mm_struct *mm = vma->vm_mm;
405 pud_t pud;
406
407 /*
408 * The destination pud shouldn't be established, free_pgtables()
409 * should have released it.
410 */
411 if (WARN_ON_ONCE(!pud_none(*new_pud)))
412 return false;
413
414 /*
415 * We don't have to worry about the ordering of src and dst
416 * ptlocks because exclusive mmap_lock prevents deadlock.
417 */
418 old_ptl = pud_lock(vma->vm_mm, old_pud);
419 new_ptl = pud_lockptr(mm, new_pud);
420 if (new_ptl != old_ptl)
421 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
422
423 /* Clear the pud */
424 pud = *old_pud;
425 pud_clear(old_pud);
426
427 VM_BUG_ON(!pud_none(*new_pud));
428
429 /* Set the new pud */
430 /* mark soft_dirty when we add pud-level soft dirty support */
431 set_pud_at(mm, new_addr, new_pud, pud);
432 flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
433 if (new_ptl != old_ptl)
434 spin_unlock(new_ptl);
435 spin_unlock(old_ptl);
436
437 return true;
438 }
439 #else
440 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
441 unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
442 {
443 WARN_ON_ONCE(1);
444 return false;
445
446 }
447 #endif
448
449 enum pgt_entry {
450 NORMAL_PMD,
451 HPAGE_PMD,
452 NORMAL_PUD,
453 HPAGE_PUD,
454 };
455
456 /*
457 * Returns an extent of the corresponding size for the pgt_entry specified if
458 * valid. Else returns a smaller extent bounded by the end of the source and
459 * destination pgt_entry.
460 */
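/*
 * For example, with 2MiB PMDs, old_addr == 0x201000 and new_addr == 0x403000,
 * the source side allows 0x1ff000 bytes up to its next PMD boundary but the
 * destination side only 0x1fd000, so 0x1fd000 is returned (assuming old_end
 * is far enough away). The addresses here are illustrative only.
 */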
461 static __always_inline unsigned long get_extent(enum pgt_entry entry,
462 unsigned long old_addr, unsigned long old_end,
463 unsigned long new_addr)
464 {
465 unsigned long next, extent, mask, size;
466
467 switch (entry) {
468 case HPAGE_PMD:
469 case NORMAL_PMD:
470 mask = PMD_MASK;
471 size = PMD_SIZE;
472 break;
473 case HPAGE_PUD:
474 case NORMAL_PUD:
475 mask = PUD_MASK;
476 size = PUD_SIZE;
477 break;
478 default:
479 BUILD_BUG();
480 break;
481 }
482
483 next = (old_addr + size) & mask;
484 /* even if next overflowed, extent below will be ok */
485 extent = next - old_addr;
486 if (extent > old_end - old_addr)
487 extent = old_end - old_addr;
488 next = (new_addr + size) & mask;
489 if (extent > next - new_addr)
490 extent = next - new_addr;
491 return extent;
492 }
493
494 /*
495 * Attempts to speedup the move by moving entry at the level corresponding to
496 * pgt_entry. Returns true if the move was successful, else false.
497 */
498 static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
499 unsigned long old_addr, unsigned long new_addr,
500 void *old_entry, void *new_entry, bool need_rmap_locks)
501 {
502 bool moved = false;
503
504 /* See comment in move_ptes() */
505 if (need_rmap_locks)
506 take_rmap_locks(vma);
507
508 switch (entry) {
509 case NORMAL_PMD:
510 moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
511 new_entry);
512 break;
513 case NORMAL_PUD:
514 moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
515 new_entry);
516 break;
517 case HPAGE_PMD:
518 moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
519 move_huge_pmd(vma, old_addr, new_addr, old_entry,
520 new_entry);
521 break;
522 case HPAGE_PUD:
523 moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
524 move_huge_pud(vma, old_addr, new_addr, old_entry,
525 new_entry);
526 break;
527
528 default:
529 WARN_ON_ONCE(1);
530 break;
531 }
532
533 if (need_rmap_locks)
534 drop_rmap_locks(vma);
535
536 return moved;
537 }
538
539 /*
540 * A helper to check if aligning down is OK. The aligned address should fall
541 * on *no mapping*. For the stack moving down, that's a special move within
542 * the VMA that is created to span the source and destination of the move,
543 * so we make an exception for it.
544 */
545 static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
546 unsigned long mask, bool for_stack)
547 {
548 unsigned long addr_masked = addr_to_align & mask;
549
550 /*
551 * If @addr_to_align of either source or destination is not the beginning
552 * of the corresponding VMA, we can't align down or we will destroy part
553 * of the current mapping.
554 */
555 if (!for_stack && vma->vm_start != addr_to_align)
556 return false;
557
558 /* In the stack case we explicitly permit in-VMA alignment. */
559 if (for_stack && addr_masked >= vma->vm_start)
560 return true;
561
562 /*
563 * Make sure the realignment doesn't cause the address to fall on an
564 * existing mapping.
565 */
566 return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
567 }
568
569 /* Opportunistically realign to specified boundary for faster copy. */
570 static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
571 unsigned long *new_addr, struct vm_area_struct *new_vma,
572 unsigned long mask, bool for_stack)
573 {
574 /* Skip if the addresses are already aligned. */
575 if ((*old_addr & ~mask) == 0)
576 return;
577
578 /* Only realign if the new and old addresses are mutually aligned. */
579 if ((*old_addr & ~mask) != (*new_addr & ~mask))
580 return;
581
582 /* Ensure realignment doesn't cause overlap with existing mappings. */
583 if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
584 !can_align_down(new_vma, *new_addr, mask, for_stack))
585 return;
586
587 *old_addr = *old_addr & mask;
588 *new_addr = *new_addr & mask;
589 }
590
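/*
 * Move the page table entries for len bytes from old_addr in vma to
 * new_addr in new_vma, working at the largest granularity possible
 * (PUD, then PMD, then individual PTEs). Returns the number of bytes
 * actually moved, which may be less than len if a destination page
 * table could not be allocated.
 */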
591 unsigned long move_page_tables(struct vm_area_struct *vma,
592 unsigned long old_addr, struct vm_area_struct *new_vma,
593 unsigned long new_addr, unsigned long len,
594 bool need_rmap_locks, bool for_stack)
595 {
596 unsigned long extent, old_end;
597 struct mmu_notifier_range range;
598 pmd_t *old_pmd, *new_pmd;
599 pud_t *old_pud, *new_pud;
600
601 if (!len)
602 return 0;
603
604 old_end = old_addr + len;
605
606 if (is_vm_hugetlb_page(vma))
607 return move_hugetlb_page_tables(vma, new_vma, old_addr,
608 new_addr, len);
609
610 /*
611 * If possible, realign addresses to PMD boundary for faster copy.
612 * Only realign if the mremap copying hits a PMD boundary.
613 */
614 if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
615 try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
616 for_stack);
617
618 flush_cache_range(vma, old_addr, old_end);
619 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
620 old_addr, old_end);
621 mmu_notifier_invalidate_range_start(&range);
622
623 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
624 cond_resched();
625 /*
626 * If extent is PUD-sized try to speed up the move by moving at the
627 * PUD level if possible.
628 */
629 extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
630
631 old_pud = get_old_pud(vma->vm_mm, old_addr);
632 if (!old_pud)
633 continue;
634 new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
635 if (!new_pud)
636 break;
637 if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
638 if (extent == HPAGE_PUD_SIZE) {
639 move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
640 old_pud, new_pud, need_rmap_locks);
641 /* We ignore and continue on error? */
642 continue;
643 }
644 } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
645
646 if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
647 old_pud, new_pud, true))
648 continue;
649 }
650
651 extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
652 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
653 if (!old_pmd)
654 continue;
655 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
656 if (!new_pmd)
657 break;
658 again:
659 if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
660 pmd_devmap(*old_pmd)) {
661 if (extent == HPAGE_PMD_SIZE &&
662 move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
663 old_pmd, new_pmd, need_rmap_locks))
664 continue;
665 split_huge_pmd(vma, old_pmd, old_addr);
666 } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
667 extent == PMD_SIZE) {
668 /*
669 * If the extent is PMD-sized, try to speed the move by
670 * moving at the PMD level if possible.
671 */
672 if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
673 old_pmd, new_pmd, true))
674 continue;
675 }
676 if (pmd_none(*old_pmd))
677 continue;
678 if (pte_alloc(new_vma->vm_mm, new_pmd))
679 break;
680 if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
681 new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
682 goto again;
683 }
684
685 mmu_notifier_invalidate_range_end(&range);
686
687 /*
688 * Prevent negative return values when {old,new}_addr was realigned
689 * but we broke out of the above loop for the first PMD itself.
690 */
691 if (old_addr < old_end - len)
692 return 0;
693
694 return len + old_addr - old_end; /* how much done */
695 }
696
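/*
 * Set up new_vma with copy_vma(), move the page tables across with
 * move_page_tables() (moving them back on failure), fix up VM_ACCOUNT
 * and locked_vm accounting, and finally unmap the old range unless
 * MREMAP_DONTUNMAP was requested. Returns the new address or an error.
 */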
697 static unsigned long move_vma(struct vm_area_struct *vma,
698 unsigned long old_addr, unsigned long old_len,
699 unsigned long new_len, unsigned long new_addr,
700 bool *locked, unsigned long flags,
701 struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
702 {
703 long to_account = new_len - old_len;
704 struct mm_struct *mm = vma->vm_mm;
705 struct vm_area_struct *new_vma;
706 unsigned long vm_flags = vma->vm_flags;
707 unsigned long new_pgoff;
708 unsigned long moved_len;
709 bool account_start = false;
710 bool account_end = false;
711 unsigned long hiwater_vm;
712 int err = 0;
713 bool need_rmap_locks;
714 struct vma_iterator vmi;
715
716 /*
717 * We'd prefer to avoid failure later on in do_munmap:
718 * which may split one vma into three before unmapping.
719 */
720 if (mm->map_count >= sysctl_max_map_count - 3)
721 return -ENOMEM;
722
723 if (unlikely(flags & MREMAP_DONTUNMAP))
724 to_account = new_len;
725
726 if (vma->vm_ops && vma->vm_ops->may_split) {
727 if (vma->vm_start != old_addr)
728 err = vma->vm_ops->may_split(vma, old_addr);
729 if (!err && vma->vm_end != old_addr + old_len)
730 err = vma->vm_ops->may_split(vma, old_addr + old_len);
731 if (err)
732 return err;
733 }
734
735 /*
736 * Advise KSM to break any KSM pages in the area to be moved:
737 * it would be confusing if they were to turn up at the new
738 * location, where they happen to coincide with different KSM
739 * pages recently unmapped. But leave vma->vm_flags as it was,
740 * so KSM can come around to merge on vma and new_vma afterwards.
741 */
742 err = ksm_madvise(vma, old_addr, old_addr + old_len,
743 MADV_UNMERGEABLE, &vm_flags);
744 if (err)
745 return err;
746
747 if (vm_flags & VM_ACCOUNT) {
748 if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
749 return -ENOMEM;
750 }
751
752 vma_start_write(vma);
753 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
754 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
755 &need_rmap_locks);
756 if (!new_vma) {
757 if (vm_flags & VM_ACCOUNT)
758 vm_unacct_memory(to_account >> PAGE_SHIFT);
759 return -ENOMEM;
760 }
761
762 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
763 need_rmap_locks, false);
764 if (moved_len < old_len) {
765 err = -ENOMEM;
766 } else if (vma->vm_ops && vma->vm_ops->mremap) {
767 err = vma->vm_ops->mremap(new_vma);
768 }
769
770 if (unlikely(err)) {
771 /*
772 * On error, move entries back from new area to old,
773 * which will succeed since page tables still there,
774 * and then proceed to unmap new area instead of old.
775 */
776 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
777 true, false);
778 vma = new_vma;
779 old_len = new_len;
780 old_addr = new_addr;
781 new_addr = err;
782 } else {
783 mremap_userfaultfd_prep(new_vma, uf);
784 }
785
786 if (is_vm_hugetlb_page(vma)) {
787 clear_vma_resv_huge_pages(vma);
788 }
789
790 /* Conceal VM_ACCOUNT so old reservation is not undone */
791 if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
792 vm_flags_clear(vma, VM_ACCOUNT);
793 if (vma->vm_start < old_addr)
794 account_start = true;
795 if (vma->vm_end > old_addr + old_len)
796 account_end = true;
797 }
798
799 /*
800 * If we failed to move page tables we still do total_vm increment
801 * since do_munmap() will decrement it by old_len == new_len.
802 *
803 * Since total_vm is about to be raised artificially high for a
804 * moment, we need to restore high watermark afterwards: if stats
805 * are taken meanwhile, total_vm and hiwater_vm appear too high.
806 * If this were a serious issue, we'd add a flag to do_munmap().
807 */
808 hiwater_vm = mm->hiwater_vm;
809 vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
810
811 /* Tell the pfn-tracking code that the pfnmap has moved away from this vma */
812 if (unlikely(vma->vm_flags & VM_PFNMAP))
813 untrack_pfn_clear(vma);
814
815 if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
816 /* We always clear VM_LOCKED[ONFAULT] on the old vma */
817 vm_flags_clear(vma, VM_LOCKED_MASK);
818
819 /*
820 * The anon_vma links of the old vma are no longer needed after its
821 * page tables have been moved.
822 */
823 if (new_vma != vma && vma->vm_start == old_addr &&
824 vma->vm_end == (old_addr + old_len))
825 unlink_anon_vmas(vma);
826
827 /* Because we won't unmap we don't need to touch locked_vm */
828 return new_addr;
829 }
830
831 vma_iter_init(&vmi, mm, old_addr);
832 if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
833 /* OOM: unable to split vma, just get accounts right */
834 if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
835 vm_acct_memory(old_len >> PAGE_SHIFT);
836 account_start = account_end = false;
837 }
838
839 if (vm_flags & VM_LOCKED) {
840 mm->locked_vm += new_len >> PAGE_SHIFT;
841 *locked = true;
842 }
843
844 mm->hiwater_vm = hiwater_vm;
845
846 /* Restore VM_ACCOUNT if one or two pieces of vma left */
847 if (account_start) {
848 vma = vma_prev(&vmi);
849 vm_flags_set(vma, VM_ACCOUNT);
850 }
851
852 if (account_end) {
853 vma = vma_next(&vmi);
854 vm_flags_set(vma, VM_ACCOUNT);
855 }
856
857 return new_addr;
858 }
859
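/*
 * Look up and sanity-check the vma at @addr for a resize from old_len to
 * new_len; returns the vma on success or an ERR_PTR() on failure.
 */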
860 static struct vm_area_struct *vma_to_resize(unsigned long addr,
861 unsigned long old_len, unsigned long new_len, unsigned long flags)
862 {
863 struct mm_struct *mm = current->mm;
864 struct vm_area_struct *vma;
865 unsigned long pgoff;
866
867 vma = vma_lookup(mm, addr);
868 if (!vma)
869 return ERR_PTR(-EFAULT);
870
871 /*
872 * !old_len is a special case where an attempt is made to 'duplicate'
873 * a mapping. This makes no sense for private mappings as it will
874 * instead create a fresh/new mapping unrelated to the original. This
875 * is contrary to the basic idea of mremap which creates new mappings
876 * based on the original. There are no known use cases for this
877 * behavior. As a result, fail such attempts.
878 */
879 if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
880 pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
881 return ERR_PTR(-EINVAL);
882 }
883
884 if ((flags & MREMAP_DONTUNMAP) &&
885 (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
886 return ERR_PTR(-EINVAL);
887
888 /* We can't remap across vm area boundaries */
889 if (old_len > vma->vm_end - addr)
890 return ERR_PTR(-EFAULT);
891
892 if (new_len == old_len)
893 return vma;
894
895 /* Need to be careful about a growing mapping */
896 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
897 pgoff += vma->vm_pgoff;
898 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
899 return ERR_PTR(-EINVAL);
900
901 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
902 return ERR_PTR(-EFAULT);
903
904 if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
905 return ERR_PTR(-EAGAIN);
906
907 if (!may_expand_vm(mm, vma->vm_flags,
908 (new_len - old_len) >> PAGE_SHIFT))
909 return ERR_PTR(-ENOMEM);
910
911 return vma;
912 }
913
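/*
 * Handle the MREMAP_FIXED / MREMAP_DONTUNMAP path: validate the new range,
 * unmap the destination if required, shrink the source when new_len is
 * smaller, then hand off to move_vma().
 */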
914 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
915 unsigned long new_addr, unsigned long new_len, bool *locked,
916 unsigned long flags, struct vm_userfaultfd_ctx *uf,
917 struct list_head *uf_unmap_early,
918 struct list_head *uf_unmap)
919 {
920 struct mm_struct *mm = current->mm;
921 struct vm_area_struct *vma;
922 unsigned long ret = -EINVAL;
923 unsigned long map_flags = 0;
924
925 if (offset_in_page(new_addr))
926 goto out;
927
928 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
929 goto out;
930
931 /* Ensure the old/new locations do not overlap */
932 if (addr + old_len > new_addr && new_addr + new_len > addr)
933 goto out;
934
935 /*
936 * move_vma() needs us to stay 4 maps below the threshold, otherwise
937 * it will bail out at the very beginning.
938 * That is a problem if we have already unmapped the regions here
939 * (new_addr, and old_addr), because userspace will not know the
940 * state of the vmas after it gets -ENOMEM.
941 * So, to avoid such a scenario, we can pre-compute whether the whole
942 * operation has a high chance of succeeding map-wise.
943 * The worst case is when both vmas (new_addr and old_addr) get
944 * split in 3 before unmapping them.
945 * That means 2 more maps (1 for each) to the ones we already hold.
946 * Check whether current map count plus 2 still leads us to 4 maps below
947 * the threshold, otherwise return -ENOMEM here to be more safe.
948 */
949 if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
950 return -ENOMEM;
951
952 if (flags & MREMAP_FIXED) {
953 /*
954 * In mremap_to() the VMA is moved to the dst address, so the
955 * dst range is unmapped first.
956 * do_munmap() will check whether dst is sealed.
957 */
958 ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
959 if (ret)
960 goto out;
961 }
962
963 if (old_len > new_len) {
964 ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
965 if (ret)
966 goto out;
967 old_len = new_len;
968 }
969
970 vma = vma_to_resize(addr, old_len, new_len, flags);
971 if (IS_ERR(vma)) {
972 ret = PTR_ERR(vma);
973 goto out;
974 }
975
976 /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
977 if (flags & MREMAP_DONTUNMAP &&
978 !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
979 ret = -ENOMEM;
980 goto out;
981 }
982
983 if (flags & MREMAP_FIXED)
984 map_flags |= MAP_FIXED;
985
986 if (vma->vm_flags & VM_MAYSHARE)
987 map_flags |= MAP_SHARED;
988
989 ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
990 ((addr - vma->vm_start) >> PAGE_SHIFT),
991 map_flags);
992 if (IS_ERR_VALUE(ret))
993 goto out;
994
995 /* We got a new mapping */
996 if (!(flags & MREMAP_FIXED))
997 new_addr = ret;
998
999 ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
1000 uf_unmap);
1001
1002 out:
1003 return ret;
1004 }
1005
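/*
 * Can the vma grow in place by delta bytes? Check for address overflow,
 * an intersecting vma, and that get_unmapped_area() accepts the enlarged
 * range at the current start address.
 */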
1006 static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
1007 {
1008 unsigned long end = vma->vm_end + delta;
1009
1010 if (end < vma->vm_end) /* overflow */
1011 return 0;
1012 if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
1013 return 0;
1014 if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
1015 0, MAP_FIXED) & ~PAGE_MASK)
1016 return 0;
1017 return 1;
1018 }
1019
1020 /*
1021 * Expand (or shrink) an existing mapping, potentially moving it at the
1022 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1023 *
1024 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
1025 * This option implies MREMAP_MAYMOVE.
1026 */
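/*
 * Illustrative userspace usage (not part of this file): grow a mapping,
 * letting the kernel move it if it cannot be expanded in place:
 *
 *	void *new = mremap(old, old_size, new_size, MREMAP_MAYMOVE);
 *	if (new == MAP_FAILED)
 *		... handle the error ...
 */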
1027 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1028 unsigned long, new_len, unsigned long, flags,
1029 unsigned long, new_addr)
1030 {
1031 struct mm_struct *mm = current->mm;
1032 struct vm_area_struct *vma;
1033 unsigned long ret = -EINVAL;
1034 bool locked = false;
1035 struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
1036 LIST_HEAD(uf_unmap_early);
1037 LIST_HEAD(uf_unmap);
1038
1039 /*
1040 * There is a deliberate asymmetry here: we strip the pointer tag
1041 * from the old address but leave the new address alone. This is
1042 * for consistency with mmap(), where we prevent the creation of
1043 * aliasing mappings in userspace by leaving the tag bits of the
1044 * mapping address intact. A non-zero tag will cause the subsequent
1045 * range checks to reject the address as invalid.
1046 *
1047 * See Documentation/arch/arm64/tagged-address-abi.rst for more
1048 * information.
1049 */
1050 addr = untagged_addr(addr);
1051
1052 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
1053 return ret;
1054
1055 if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
1056 return ret;
1057
1058 /*
1059 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
1060 * in the process.
1061 */
1062 if (flags & MREMAP_DONTUNMAP &&
1063 (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
1064 return ret;
1065
1066
1067 if (__offset_in_page_log(addr))
1068 return ret;
1069
1070 old_len = __PAGE_ALIGN(old_len);
1071 new_len = __PAGE_ALIGN(new_len);
1072
1073 /*
1074 * We allow a zero old-len as a special case
1075 * for DOS-emu "duplicate shm area" thing. But
1076 * a zero new-len is nonsensical.
1077 */
1078 if (!new_len)
1079 return ret;
1080
1081 if (mmap_write_lock_killable(current->mm))
1082 return -EINTR;
1083 vma = vma_lookup(mm, addr);
1084 if (!vma) {
1085 ret = -EFAULT;
1086 goto out;
1087 }
1088
1089 /* Don't allow remapping vmas when they have already been sealed */
1090 if (!can_modify_vma(vma)) {
1091 ret = -EPERM;
1092 goto out;
1093 }
1094
1095 if (is_vm_hugetlb_page(vma)) {
1096 struct hstate *h __maybe_unused = hstate_vma(vma);
1097
1098 old_len = ALIGN(old_len, huge_page_size(h));
1099 new_len = ALIGN(new_len, huge_page_size(h));
1100
1101 /* addrs must be huge page aligned */
1102 if (addr & ~huge_page_mask(h))
1103 goto out;
1104 if (new_addr & ~huge_page_mask(h))
1105 goto out;
1106
1107 /*
1108 * Don't allow remap expansion, because the underlying hugetlb
1109 * reservation is not yet capable of handling split reservations.
1110 */
1111 if (new_len > old_len)
1112 goto out;
1113 }
1114
1115 if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
1116 ret = mremap_to(addr, old_len, new_addr, new_len,
1117 &locked, flags, &uf, &uf_unmap_early,
1118 &uf_unmap);
1119 goto out;
1120 }
1121
1122 /*
1123 * Always allow a shrinking remap: that just unmaps
1124 * the unnecessary pages..
1125 * do_vmi_munmap does all the needed commit accounting, and
1126 * unlocks the mmap_lock if so directed.
1127 */
1128 if (old_len >= new_len) {
1129 VMA_ITERATOR(vmi, mm, addr + new_len);
1130
1131 if (old_len == new_len) {
1132 ret = addr;
1133 goto out;
1134 }
1135
1136 ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
1137 &uf_unmap, true);
1138 if (ret)
1139 goto out;
1140
1141 ret = addr;
1142 goto out_unlocked;
1143 }
1144
1145 /*
1146 * Ok, we need to grow..
1147 */
1148 vma = vma_to_resize(addr, old_len, new_len, flags);
1149 if (IS_ERR(vma)) {
1150 ret = PTR_ERR(vma);
1151 goto out;
1152 }
1153
1154 /* old_len exactly to the end of the area..
1155 */
1156 if (old_len == vma->vm_end - addr) {
1157 unsigned long delta = new_len - old_len;
1158
1159 /* can we just expand the current mapping? */
1160 if (vma_expandable(vma, delta)) {
1161 long pages = delta >> PAGE_SHIFT;
1162 VMA_ITERATOR(vmi, mm, vma->vm_end);
1163 long charged = 0;
1164
1165 if (vma->vm_flags & VM_ACCOUNT) {
1166 if (security_vm_enough_memory_mm(mm, pages)) {
1167 ret = -ENOMEM;
1168 goto out;
1169 }
1170 charged = pages;
1171 }
1172
1173 /*
1174 * vma_merge_extend() is called on the extension we are
1175 * adding to the already existing vma; it will merge this
1176 * extension with the existing vma (the expand operation
1177 * itself) and possibly also with the next vma if it
1178 * becomes adjacent to the expanded vma and is otherwise
1179 * compatible.
1181 */
1182 vma = vma_merge_extend(&vmi, vma, delta);
1183 if (!vma) {
1184 vm_unacct_memory(charged);
1185 ret = -ENOMEM;
1186 goto out;
1187 }
1188
1189 vm_stat_account(mm, vma->vm_flags, pages);
1190 if (vma->vm_flags & VM_LOCKED) {
1191 mm->locked_vm += pages;
1192 locked = true;
1193 new_addr = addr;
1194 }
1195 ret = addr;
1196 goto out;
1197 }
1198 }
1199
1200 /*
1201 * We weren't able to just expand or shrink the area,
1202 * we need to create a new one and move it..
1203 */
1204 ret = -ENOMEM;
1205 if (flags & MREMAP_MAYMOVE) {
1206 unsigned long map_flags = 0;
1207 if (vma->vm_flags & VM_MAYSHARE)
1208 map_flags |= MAP_SHARED;
1209
1210 new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1211 vma->vm_pgoff +
1212 ((addr - vma->vm_start) >> PAGE_SHIFT),
1213 map_flags);
1214 if (IS_ERR_VALUE(new_addr)) {
1215 ret = new_addr;
1216 goto out;
1217 }
1218
1219 ret = move_vma(vma, addr, old_len, new_len, new_addr,
1220 &locked, flags, &uf, &uf_unmap);
1221 }
1222 out:
1223 if (offset_in_page(ret))
1224 locked = false;
1225 mmap_write_unlock(current->mm);
1226 if (locked && new_len > old_len)
1227 mm_populate(new_addr + old_len, new_len - old_len);
1228 out_unlocked:
1229 userfaultfd_unmap_complete(mm, &uf_unmap_early);
1230 mremap_userfaultfd_complete(&uf, addr, ret, old_len);
1231 userfaultfd_unmap_complete(mm, &uf_unmap);
1232 return ret;
1233 }
1234