1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	mm/mremap.c
4  *
5  *	(C) Copyright 1996 Linus Torvalds
6  *
7  *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
8  *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
9  */
10 
11 #include <linux/mm.h>
12 #include <linux/mm_inline.h>
13 #include <linux/hugetlb.h>
14 #include <linux/shm.h>
15 #include <linux/ksm.h>
16 #include <linux/mman.h>
17 #include <linux/swap.h>
18 #include <linux/capability.h>
19 #include <linux/fs.h>
20 #include <linux/swapops.h>
21 #include <linux/highmem.h>
22 #include <linux/security.h>
23 #include <linux/syscalls.h>
24 #include <linux/mmu_notifier.h>
25 #include <linux/uaccess.h>
26 #include <linux/userfaultfd_k.h>
27 #include <linux/mempolicy.h>
28 #include <linux/page_size_compat.h>
29 
30 #include <asm/cacheflush.h>
31 #include <asm/tlb.h>
32 #include <asm/pgalloc.h>
33 
34 #include "internal.h"
35 
36 static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
37 {
38 	pgd_t *pgd;
39 	p4d_t *p4d;
40 	pud_t *pud;
41 
42 	pgd = pgd_offset(mm, addr);
43 	if (pgd_none_or_clear_bad(pgd))
44 		return NULL;
45 
46 	p4d = p4d_offset(pgd, addr);
47 	if (p4d_none_or_clear_bad(p4d))
48 		return NULL;
49 
50 	pud = pud_offset(p4d, addr);
51 	if (pud_none_or_clear_bad(pud))
52 		return NULL;
53 
54 	return pud;
55 }
56 
57 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
58 {
59 	pud_t *pud;
60 	pmd_t *pmd;
61 
62 	pud = get_old_pud(mm, addr);
63 	if (!pud)
64 		return NULL;
65 
66 	pmd = pmd_offset(pud, addr);
67 	if (pmd_none(*pmd))
68 		return NULL;
69 
70 	return pmd;
71 }
72 
73 static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
74 			    unsigned long addr)
75 {
76 	pgd_t *pgd;
77 	p4d_t *p4d;
78 
79 	pgd = pgd_offset(mm, addr);
80 	p4d = p4d_alloc(mm, pgd, addr);
81 	if (!p4d)
82 		return NULL;
83 
84 	return pud_alloc(mm, p4d, addr);
85 }
86 
87 static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
88 			    unsigned long addr)
89 {
90 	pud_t *pud;
91 	pmd_t *pmd;
92 
93 	pud = alloc_new_pud(mm, vma, addr);
94 	if (!pud)
95 		return NULL;
96 
97 	pmd = pmd_alloc(mm, pud, addr);
98 	if (!pmd)
99 		return NULL;
100 
101 	VM_BUG_ON(pmd_trans_huge(*pmd));
102 
103 	return pmd;
104 }
105 
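/*
 * Note: get_old_pud()/get_old_pmd() only look up existing entries and
 * return NULL when nothing is mapped at the source, while alloc_new_pud()/
 * alloc_new_pmd() build the missing intermediate levels for the destination
 * and can fail under memory pressure.  move_page_tables() below pairs them
 * accordingly: a NULL "old" entry means "nothing to move, skip ahead",
 * whereas a NULL "new" entry aborts the walk.
 */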
106 static void take_rmap_locks(struct vm_area_struct *vma)
107 {
108 	if (vma->vm_file)
109 		i_mmap_lock_write(vma->vm_file->f_mapping);
110 	if (vma->anon_vma)
111 		anon_vma_lock_write(vma->anon_vma);
112 }
113 
114 static void drop_rmap_locks(struct vm_area_struct *vma)
115 {
116 	if (vma->anon_vma)
117 		anon_vma_unlock_write(vma->anon_vma);
118 	if (vma->vm_file)
119 		i_mmap_unlock_write(vma->vm_file->f_mapping);
120 }
121 
122 static pte_t move_soft_dirty_pte(pte_t pte)
123 {
124 	/*
125 	 * Set soft dirty bit so we can notice
126 	 * in userspace the ptes were moved.
127 	 */
128 #ifdef CONFIG_MEM_SOFT_DIRTY
129 	if (pte_present(pte))
130 		pte = pte_mksoft_dirty(pte);
131 	else if (is_swap_pte(pte))
132 		pte = pte_swp_mksoft_dirty(pte);
133 #endif
134 	return pte;
135 }
136 
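/*
 * Background (summary, not from this file): the soft-dirty bit is consumed
 * by userspace dirty-page trackers (e.g. checkpoint/restore) via
 * /proc/<pid>/clear_refs and /proc/<pid>/pagemap.  Marking the moved ptes
 * soft-dirty ensures a tracker that cleared the bits before an mremap()
 * still notices that the data now lives at the new address.
 */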
137 static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
138 		unsigned long old_addr, unsigned long old_end,
139 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
140 		unsigned long new_addr, bool need_rmap_locks)
141 {
142 	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
143 	struct mm_struct *mm = vma->vm_mm;
144 	pte_t *old_pte, *new_pte, pte;
145 	pmd_t dummy_pmdval;
146 	spinlock_t *old_ptl, *new_ptl;
147 	bool force_flush = false;
148 	unsigned long len = old_end - old_addr;
149 	int err = 0;
150 
151 	/*
152 	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
153 	 * locks to ensure that rmap will always observe either the old or the
154 	 * new ptes. This is the easiest way to avoid races with
155 	 * truncate_pagecache(), page migration, etc...
156 	 *
157 	 * When need_rmap_locks is false, we use other ways to avoid
158 	 * such races:
159 	 *
160 	 * - During exec() shift_arg_pages(), we use a specially tagged vma
161 	 *   which rmap call sites look for using vma_is_temporary_stack().
162 	 *
163 	 * - During mremap(), new_vma is often known to be placed after vma
164 	 *   in rmap traversal order. This ensures rmap will always observe
165 	 *   either the old pte, or the new pte, or both (the page table locks
166 	 *   serialize access to individual ptes, but only rmap traversal
167 	 *   order guarantees that we won't miss both the old and new ptes).
168 	 */
169 	if (need_rmap_locks)
170 		take_rmap_locks(vma);
171 
172 	/*
173 	 * We don't have to worry about the ordering of src and dst
174 	 * pte locks because exclusive mmap_lock prevents deadlock.
175 	 */
176 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
177 	if (!old_pte) {
178 		err = -EAGAIN;
179 		goto out;
180 	}
181 	/*
182 	 * Now new_pte is none, so hpage_collapse_scan_file() path can not find
183 	 * this by traversing file->f_mapping, so there is no concurrency with
184 	 * retract_page_tables(). In addition, we already hold the exclusive
185 	 * mmap_lock, so this new_pte page is stable, so there is no need to get
186 	 * pmdval and do pmd_same() check.
187 	 */
188 	new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
189 					   &new_ptl);
190 	if (!new_pte) {
191 		pte_unmap_unlock(old_pte, old_ptl);
192 		err = -EAGAIN;
193 		goto out;
194 	}
195 	if (new_ptl != old_ptl)
196 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
197 	flush_tlb_batched_pending(vma->vm_mm);
198 	arch_enter_lazy_mmu_mode();
199 
200 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
201 				   new_pte++, new_addr += PAGE_SIZE) {
202 		if (pte_none(ptep_get(old_pte)))
203 			continue;
204 
205 		pte = ptep_get_and_clear(mm, old_addr, old_pte);
206 		/*
207 		 * If we are remapping a valid PTE, make sure
208 		 * to flush TLB before we drop the PTL for the
209 		 * PTE.
210 		 *
211 		 * NOTE! Both old and new PTL matter: the old one
212 		 * for racing with folio_mkclean(), the new one to
213 		 * make sure the physical page stays valid until
214 		 * the TLB entry for the old mapping has been
215 		 * flushed.
216 		 */
217 		if (pte_present(pte))
218 			force_flush = true;
219 		pte = move_pte(pte, old_addr, new_addr);
220 		pte = move_soft_dirty_pte(pte);
221 
222 		if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
223 			pte_clear(mm, new_addr, new_pte);
224 		else {
225 			if (need_clear_uffd_wp) {
226 				if (pte_present(pte))
227 					pte = pte_clear_uffd_wp(pte);
228 				else if (is_swap_pte(pte))
229 					pte = pte_swp_clear_uffd_wp(pte);
230 			}
231 			set_pte_at(mm, new_addr, new_pte, pte);
232 		}
233 	}
234 
235 	arch_leave_lazy_mmu_mode();
236 	if (force_flush)
237 		flush_tlb_range(vma, old_end - len, old_end);
238 	if (new_ptl != old_ptl)
239 		spin_unlock(new_ptl);
240 	pte_unmap(new_pte - 1);
241 	pte_unmap_unlock(old_pte - 1, old_ptl);
242 out:
243 	if (need_rmap_locks)
244 		drop_rmap_locks(vma);
245 	return err;
246 }
247 
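/*
 * Note on the -EAGAIN path above: pte_offset_map_lock() or
 * pte_offset_map_rw_nolock() can fail when the pmd changed underneath us
 * (e.g. a concurrent THP collapse).  move_ptes() then returns -EAGAIN and
 * move_page_tables() re-evaluates the source pmd via its "again:" label
 * before retrying.
 */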
248 #ifndef arch_supports_page_table_move
249 #define arch_supports_page_table_move arch_supports_page_table_move
250 static inline bool arch_supports_page_table_move(void)
251 {
252 	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
253 		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
254 }
255 #endif
256 
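/*
 * The #ifndef/#define pattern above lets an architecture supply its own
 * arch_supports_page_table_move() (for example to veto these optimizations
 * entirely); the generic fallback simply keys off CONFIG_HAVE_MOVE_PMD and
 * CONFIG_HAVE_MOVE_PUD.
 */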
257 #ifdef CONFIG_HAVE_MOVE_PMD
258 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
259 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
260 {
261 	spinlock_t *old_ptl, *new_ptl;
262 	struct mm_struct *mm = vma->vm_mm;
263 	bool res = false;
264 	pmd_t pmd;
265 
266 	if (!arch_supports_page_table_move())
267 		return false;
268 	/*
269 	 * The destination pmd shouldn't be established, free_pgtables()
270 	 * should have released it.
271 	 *
272 	 * However, there's a case during execve() where we use mremap
273 	 * to move the initial stack, and in that case the target area
274 	 * may overlap the source area (always moving down).
275 	 *
276 	 * If everything is PMD-aligned, that works fine, as moving
277 	 * each pmd down will clear the source pmd. But if we first
278 	 * have a few 4kB-only pages that get moved down, and then
279 	 * hit the "now the rest is PMD-aligned, let's do everything
280 	 * one pmd at a time", we will still have the old (now empty
281 	 * of any 4kB pages, but still there) PMD in the page table
282 	 * tree.
283 	 *
284 	 * Warn on it once - because we really should try to figure
285 	 * out how to do this better - but then say "I won't move
286 	 * this pmd".
287 	 *
288 	 * One alternative might be to just unmap the target pmd at
289 	 * this point, and verify that it really is empty. We'll see.
290 	 */
291 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
292 		return false;
293 
294 	/* If this pmd belongs to a uffd vma with remap events disabled, we need
295 	 * to ensure that the uffd-wp state is cleared from all pgtables. This
296 	 * means recursing into lower page tables in move_page_tables(), and we
297 	 * can reuse the existing code if we simply treat the entry as "not
298 	 * moved".
299 	 */
300 	if (vma_has_uffd_without_event_remap(vma))
301 		return false;
302 
303 	/*
304 	 * We don't have to worry about the ordering of src and dst
305 	 * ptlocks because exclusive mmap_lock prevents deadlock.
306 	 */
307 	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
308 	new_ptl = pmd_lockptr(mm, new_pmd);
309 	if (new_ptl != old_ptl)
310 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
311 
312 	pmd = *old_pmd;
313 
314 	/* Racing with collapse? */
315 	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
316 		goto out_unlock;
317 	/* Clear the pmd */
318 	pmd_clear(old_pmd);
319 	res = true;
320 
321 	VM_BUG_ON(!pmd_none(*new_pmd));
322 
323 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
324 	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
325 out_unlock:
326 	if (new_ptl != old_ptl)
327 		spin_unlock(new_ptl);
328 	spin_unlock(old_ptl);
329 
330 	return res;
331 }
332 #else
333 static inline bool move_normal_pmd(struct vm_area_struct *vma,
334 		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
335 		pmd_t *new_pmd)
336 {
337 	return false;
338 }
339 #endif
340 
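/*
 * Worked example (assuming 4 KiB base pages and 2 MiB PMDs, as on x86-64):
 * move_normal_pmd() relocates an entire page-table page by clearing the old
 * pmd and populating the new one, so a PMD-aligned 2 MiB block moves with
 * two pmd writes plus one TLB range flush instead of copying 512 individual
 * ptes through move_ptes().
 */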
341 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
342 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
343 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
344 {
345 	spinlock_t *old_ptl, *new_ptl;
346 	struct mm_struct *mm = vma->vm_mm;
347 	pud_t pud;
348 
349 	if (!arch_supports_page_table_move())
350 		return false;
351 	/*
352 	 * The destination pud shouldn't be established, free_pgtables()
353 	 * should have released it.
354 	 */
355 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
356 		return false;
357 
358 	/* If this pud belongs to a uffd vma with remap events disabled, we need
359 	 * to ensure that the uffd-wp state is cleared from all pgtables. This
360 	 * means recursing into lower page tables in move_page_tables(), and we
361 	 * can reuse the existing code if we simply treat the entry as "not
362 	 * moved".
363 	 */
364 	if (vma_has_uffd_without_event_remap(vma))
365 		return false;
366 
367 	/*
368 	 * We don't have to worry about the ordering of src and dst
369 	 * ptlocks because exclusive mmap_lock prevents deadlock.
370 	 */
371 	old_ptl = pud_lock(vma->vm_mm, old_pud);
372 	new_ptl = pud_lockptr(mm, new_pud);
373 	if (new_ptl != old_ptl)
374 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
375 
376 	/* Clear the pud */
377 	pud = *old_pud;
378 	pud_clear(old_pud);
379 
380 	VM_BUG_ON(!pud_none(*new_pud));
381 
382 	pud_populate(mm, new_pud, pud_pgtable(pud));
383 	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
384 	if (new_ptl != old_ptl)
385 		spin_unlock(new_ptl);
386 	spin_unlock(old_ptl);
387 
388 	return true;
389 }
390 #else
391 static inline bool move_normal_pud(struct vm_area_struct *vma,
392 		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
393 		pud_t *new_pud)
394 {
395 	return false;
396 }
397 #endif
398 
399 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
400 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
401 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
402 {
403 	spinlock_t *old_ptl, *new_ptl;
404 	struct mm_struct *mm = vma->vm_mm;
405 	pud_t pud;
406 
407 	/*
408 	 * The destination pud shouldn't be established, free_pgtables()
409 	 * should have released it.
410 	 */
411 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
412 		return false;
413 
414 	/*
415 	 * We don't have to worry about the ordering of src and dst
416 	 * ptlocks because exclusive mmap_lock prevents deadlock.
417 	 */
418 	old_ptl = pud_lock(vma->vm_mm, old_pud);
419 	new_ptl = pud_lockptr(mm, new_pud);
420 	if (new_ptl != old_ptl)
421 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
422 
423 	/* Clear the pud */
424 	pud = *old_pud;
425 	pud_clear(old_pud);
426 
427 	VM_BUG_ON(!pud_none(*new_pud));
428 
429 	/* Set the new pud */
430 	/* mark soft_dirty when we add pud level soft dirty support */
431 	set_pud_at(mm, new_addr, new_pud, pud);
432 	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
433 	if (new_ptl != old_ptl)
434 		spin_unlock(new_ptl);
435 	spin_unlock(old_ptl);
436 
437 	return true;
438 }
439 #else
440 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
441 			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
442 {
443 	WARN_ON_ONCE(1);
444 	return false;
445 
446 }
447 #endif
448 
449 enum pgt_entry {
450 	NORMAL_PMD,
451 	HPAGE_PMD,
452 	NORMAL_PUD,
453 	HPAGE_PUD,
454 };
455 
456 /*
457  * Returns an extent of the corresponding size for the pgt_entry specified if
458  * valid. Else returns a smaller extent bounded by the end of the source and
459  * destination pgt_entry.
460  */
461 static __always_inline unsigned long get_extent(enum pgt_entry entry,
462 			unsigned long old_addr, unsigned long old_end,
463 			unsigned long new_addr)
464 {
465 	unsigned long next, extent, mask, size;
466 
467 	switch (entry) {
468 	case HPAGE_PMD:
469 	case NORMAL_PMD:
470 		mask = PMD_MASK;
471 		size = PMD_SIZE;
472 		break;
473 	case HPAGE_PUD:
474 	case NORMAL_PUD:
475 		mask = PUD_MASK;
476 		size = PUD_SIZE;
477 		break;
478 	default:
479 		BUILD_BUG();
480 		break;
481 	}
482 
483 	next = (old_addr + size) & mask;
484 	/* even if next overflowed, extent below will be ok */
485 	extent = next - old_addr;
486 	if (extent > old_end - old_addr)
487 		extent = old_end - old_addr;
488 	next = (new_addr + size) & mask;
489 	if (extent > next - new_addr)
490 		extent = next - new_addr;
491 	return extent;
492 }
493 
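/*
 * Worked example for get_extent() (assuming 2 MiB PMDs, i.e. PMD_SIZE ==
 * 0x200000): with old_addr == 0x1ff000, old_end == 0x600000 and
 * new_addr == 0x3ff000, the source boundary gives next == 0x200000 and
 * extent == 0x1000; the destination boundary gives next == 0x400000, so
 * next - new_addr == 0x1000 and the extent stays 0x1000.  Only one 4 KiB
 * page is moved before both addresses reach a PMD boundary and later
 * iterations can use move_normal_pmd().
 */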
494 /*
495  * Attempts to speedup the move by moving entry at the level corresponding to
496  * pgt_entry. Returns true if the move was successful, else false.
497  */
498 static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
499 			unsigned long old_addr, unsigned long new_addr,
500 			void *old_entry, void *new_entry, bool need_rmap_locks)
501 {
502 	bool moved = false;
503 
504 	/* See comment in move_ptes() */
505 	if (need_rmap_locks)
506 		take_rmap_locks(vma);
507 
508 	switch (entry) {
509 	case NORMAL_PMD:
510 		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
511 					new_entry);
512 		break;
513 	case NORMAL_PUD:
514 		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
515 					new_entry);
516 		break;
517 	case HPAGE_PMD:
518 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
519 			move_huge_pmd(vma, old_addr, new_addr, old_entry,
520 				      new_entry);
521 		break;
522 	case HPAGE_PUD:
523 		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
524 			move_huge_pud(vma, old_addr, new_addr, old_entry,
525 				      new_entry);
526 		break;
527 
528 	default:
529 		WARN_ON_ONCE(1);
530 		break;
531 	}
532 
533 	if (need_rmap_locks)
534 		drop_rmap_locks(vma);
535 
536 	return moved;
537 }
538 
539 /*
540  * A helper to check if aligning down is OK. The aligned address should fall
541  * on *no mapping*. For the stack moving down, that's a special move within
542  * the VMA that is created to span the source and destination of the move,
543  * so we make an exception for it.
544  */
545 static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
546 			    unsigned long mask, bool for_stack)
547 {
548 	unsigned long addr_masked = addr_to_align & mask;
549 
550 	/*
551 	 * If @addr_to_align of either source or destination is not the beginning
552 	 * of the corresponding VMA, we can't align down or we will destroy part
553 	 * of the current mapping.
554 	 */
555 	if (!for_stack && vma->vm_start != addr_to_align)
556 		return false;
557 
558 	/* In the stack case we explicitly permit in-VMA alignment. */
559 	if (for_stack && addr_masked >= vma->vm_start)
560 		return true;
561 
562 	/*
563 	 * Make sure the realignment doesn't cause the address to fall on an
564 	 * existing mapping.
565 	 */
566 	return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
567 }
568 
569 /* Opportunistically realign to specified boundary for faster copy. */
570 static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
571 			     unsigned long *new_addr, struct vm_area_struct *new_vma,
572 			     unsigned long mask, bool for_stack)
573 {
574 	/* Skip if the addresses are already aligned. */
575 	if ((*old_addr & ~mask) == 0)
576 		return;
577 
578 	/* Only realign if the new and old addresses are mutually aligned. */
579 	if ((*old_addr & ~mask) != (*new_addr & ~mask))
580 		return;
581 
582 	/* Ensure realignment doesn't cause overlap with existing mappings. */
583 	if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
584 	    !can_align_down(new_vma, *new_addr, mask, for_stack))
585 		return;
586 
587 	*old_addr = *old_addr & mask;
588 	*new_addr = *new_addr & mask;
589 }
590 
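/*
 * Example (addresses chosen for illustration, PMD_MASK == ~0x1fffff):
 * old_addr == 0x7f0000403000 and new_addr == 0x7f0000803000 share the same
 * offset 0x3000 within their PMDs, so they are mutually aligned.  If
 * can_align_down() accepts both, they are rounded down to 0x7f0000400000
 * and 0x7f0000800000 and the copy can proceed PMD by PMD from the start.
 */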
591 unsigned long move_page_tables(struct vm_area_struct *vma,
592 		unsigned long old_addr, struct vm_area_struct *new_vma,
593 		unsigned long new_addr, unsigned long len,
594 		bool need_rmap_locks, bool for_stack)
595 {
596 	unsigned long extent, old_end;
597 	struct mmu_notifier_range range;
598 	pmd_t *old_pmd, *new_pmd;
599 	pud_t *old_pud, *new_pud;
600 
601 	if (!len)
602 		return 0;
603 
604 	old_end = old_addr + len;
605 
606 	if (is_vm_hugetlb_page(vma))
607 		return move_hugetlb_page_tables(vma, new_vma, old_addr,
608 						new_addr, len);
609 
610 	/*
611 	 * If possible, realign addresses to PMD boundary for faster copy.
612 	 * Only realign if the mremap copying hits a PMD boundary.
613 	 */
614 	if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
615 		try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
616 				 for_stack);
617 
618 	flush_cache_range(vma, old_addr, old_end);
619 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
620 				old_addr, old_end);
621 	mmu_notifier_invalidate_range_start(&range);
622 
623 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
624 		cond_resched();
625 		/*
626 		 * If extent is PUD-sized try to speed up the move by moving at the
627 		 * PUD level if possible.
628 		 */
629 		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
630 
631 		old_pud = get_old_pud(vma->vm_mm, old_addr);
632 		if (!old_pud)
633 			continue;
634 		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
635 		if (!new_pud)
636 			break;
637 		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
638 			if (extent == HPAGE_PUD_SIZE) {
639 				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
640 					       old_pud, new_pud, need_rmap_locks);
641 				/* We ignore and continue on error? */
642 				continue;
643 			}
644 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
645 
646 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
647 					   old_pud, new_pud, true))
648 				continue;
649 		}
650 
651 		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
652 		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
653 		if (!old_pmd)
654 			continue;
655 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
656 		if (!new_pmd)
657 			break;
658 again:
659 		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
660 		    pmd_devmap(*old_pmd)) {
661 			if (extent == HPAGE_PMD_SIZE &&
662 			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
663 					   old_pmd, new_pmd, need_rmap_locks))
664 				continue;
665 			split_huge_pmd(vma, old_pmd, old_addr);
666 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
667 			   extent == PMD_SIZE) {
668 			/*
669 			 * If the extent is PMD-sized, try to speed the move by
670 			 * moving at the PMD level if possible.
671 			 */
672 			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
673 					   old_pmd, new_pmd, true))
674 				continue;
675 		}
676 		if (pmd_none(*old_pmd))
677 			continue;
678 		if (pte_alloc(new_vma->vm_mm, new_pmd))
679 			break;
680 		if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
681 			      new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
682 			goto again;
683 	}
684 
685 	mmu_notifier_invalidate_range_end(&range);
686 
687 	/*
688 	 * Prevent negative return values when {old,new}_addr was realigned
689 	 * but we broke out of the above loop for the first PMD itself.
690 	 */
691 	if (old_addr < old_end - len)
692 		return 0;
693 
694 	return len + old_addr - old_end;	/* how much done */
695 }
696 
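/*
 * Return-value contract: move_page_tables() reports how many bytes were
 * actually moved; len means complete success.  move_vma() below treats a
 * short count as failure and calls move_page_tables() in the opposite
 * direction to put the entries back before unmapping the new area.
 */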
697 static unsigned long move_vma(struct vm_area_struct *vma,
698 		unsigned long old_addr, unsigned long old_len,
699 		unsigned long new_len, unsigned long new_addr,
700 		bool *locked, unsigned long flags,
701 		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
702 {
703 	long to_account = new_len - old_len;
704 	struct mm_struct *mm = vma->vm_mm;
705 	struct vm_area_struct *new_vma;
706 	unsigned long vm_flags = vma->vm_flags;
707 	unsigned long new_pgoff;
708 	unsigned long moved_len;
709 	bool account_start = false;
710 	bool account_end = false;
711 	unsigned long hiwater_vm;
712 	int err = 0;
713 	bool need_rmap_locks;
714 	struct vma_iterator vmi;
715 
716 	/*
717 	 * We'd prefer to avoid failure later on in do_munmap,
718 	 * which may split one vma into three before unmapping.
719 	 */
720 	if (mm->map_count >= sysctl_max_map_count - 3)
721 		return -ENOMEM;
722 
723 	if (unlikely(flags & MREMAP_DONTUNMAP))
724 		to_account = new_len;
725 
726 	if (vma->vm_ops && vma->vm_ops->may_split) {
727 		if (vma->vm_start != old_addr)
728 			err = vma->vm_ops->may_split(vma, old_addr);
729 		if (!err && vma->vm_end != old_addr + old_len)
730 			err = vma->vm_ops->may_split(vma, old_addr + old_len);
731 		if (err)
732 			return err;
733 	}
734 
735 	/*
736 	 * Advise KSM to break any KSM pages in the area to be moved:
737 	 * it would be confusing if they were to turn up at the new
738 	 * location, where they happen to coincide with different KSM
739 	 * pages recently unmapped.  But leave vma->vm_flags as it was,
740 	 * so KSM can come around to merge on vma and new_vma afterwards.
741 	 */
742 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
743 						MADV_UNMERGEABLE, &vm_flags);
744 	if (err)
745 		return err;
746 
747 	if (vm_flags & VM_ACCOUNT) {
748 		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
749 			return -ENOMEM;
750 	}
751 
752 	vma_start_write(vma);
753 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
754 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
755 			   &need_rmap_locks);
756 	if (!new_vma) {
757 		if (vm_flags & VM_ACCOUNT)
758 			vm_unacct_memory(to_account >> PAGE_SHIFT);
759 		return -ENOMEM;
760 	}
761 
762 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
763 				     need_rmap_locks, false);
764 	if (moved_len < old_len) {
765 		err = -ENOMEM;
766 	} else if (vma->vm_ops && vma->vm_ops->mremap) {
767 		err = vma->vm_ops->mremap(new_vma);
768 	}
769 
770 	if (unlikely(err)) {
771 		/*
772 		 * On error, move entries back from new area to old,
773 		 * which will succeed since page tables still there,
774 		 * and then proceed to unmap new area instead of old.
775 		 */
776 		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
777 				 true, false);
778 		vma = new_vma;
779 		old_len = new_len;
780 		old_addr = new_addr;
781 		new_addr = err;
782 	} else {
783 		mremap_userfaultfd_prep(new_vma, uf);
784 	}
785 
786 	if (is_vm_hugetlb_page(vma)) {
787 		clear_vma_resv_huge_pages(vma);
788 	}
789 
790 	/* Conceal VM_ACCOUNT so old reservation is not undone */
791 	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
792 		vm_flags_clear(vma, VM_ACCOUNT);
793 		if (vma->vm_start < old_addr)
794 			account_start = true;
795 		if (vma->vm_end > old_addr + old_len)
796 			account_end = true;
797 	}
798 
799 	/*
800 	 * If we failed to move page tables we still do total_vm increment
801 	 * since do_munmap() will decrement it by old_len == new_len.
802 	 *
803 	 * Since total_vm is about to be raised artificially high for a
804 	 * moment, we need to restore high watermark afterwards: if stats
805 	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
806 	 * If this were a serious issue, we'd add a flag to do_munmap().
807 	 */
808 	hiwater_vm = mm->hiwater_vm;
809 	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
810 
811 	/* Tell PFN tracking that the pfnmap has moved from this vma */
812 	if (unlikely(vma->vm_flags & VM_PFNMAP))
813 		untrack_pfn_clear(vma);
814 
815 	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
816 		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
817 		vm_flags_clear(vma, VM_LOCKED_MASK);
818 
819 		/*
820 		 * anon_vma links of the old vma are no longer needed after its page
821 		 * table has been moved.
822 		 */
823 		if (new_vma != vma && vma->vm_start == old_addr &&
824 			vma->vm_end == (old_addr + old_len))
825 			unlink_anon_vmas(vma);
826 
827 		/* Because we won't unmap we don't need to touch locked_vm */
828 		return new_addr;
829 	}
830 
831 	vma_iter_init(&vmi, mm, old_addr);
832 	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
833 		/* OOM: unable to split vma, just get accounts right */
834 		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
835 			vm_acct_memory(old_len >> PAGE_SHIFT);
836 		account_start = account_end = false;
837 	}
838 
839 	if (vm_flags & VM_LOCKED) {
840 		mm->locked_vm += new_len >> PAGE_SHIFT;
841 		*locked = true;
842 	}
843 
844 	mm->hiwater_vm = hiwater_vm;
845 
846 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
847 	if (account_start) {
848 		vma = vma_prev(&vmi);
849 		vm_flags_set(vma, VM_ACCOUNT);
850 	}
851 
852 	if (account_end) {
853 		vma = vma_next(&vmi);
854 		vm_flags_set(vma, VM_ACCOUNT);
855 	}
856 
857 	return new_addr;
858 }
859 
860 static struct vm_area_struct *vma_to_resize(unsigned long addr,
861 	unsigned long old_len, unsigned long new_len, unsigned long flags)
862 {
863 	struct mm_struct *mm = current->mm;
864 	struct vm_area_struct *vma;
865 	unsigned long pgoff;
866 
867 	vma = vma_lookup(mm, addr);
868 	if (!vma)
869 		return ERR_PTR(-EFAULT);
870 
871 	/*
872 	 * !old_len is a special case where an attempt is made to 'duplicate'
873 	 * a mapping.  This makes no sense for private mappings as it will
874 	 * instead create a fresh/new mapping unrelated to the original.  This
875 	 * is contrary to the basic idea of mremap which creates new mappings
876 	 * based on the original.  There are no known use cases for this
877 	 * behavior.  As a result, fail such attempts.
878 	 */
879 	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
880 		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
881 		return ERR_PTR(-EINVAL);
882 	}
883 
884 	if ((flags & MREMAP_DONTUNMAP) &&
885 			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
886 		return ERR_PTR(-EINVAL);
887 
888 	/* We can't remap across vm area boundaries */
889 	if (old_len > vma->vm_end - addr)
890 		return ERR_PTR(-EFAULT);
891 
892 	if (new_len == old_len)
893 		return vma;
894 
895 	/* Need to be careful about a growing mapping */
896 	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
897 	pgoff += vma->vm_pgoff;
898 	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
899 		return ERR_PTR(-EINVAL);
900 
901 	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
902 		return ERR_PTR(-EFAULT);
903 
904 	if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
905 		return ERR_PTR(-EAGAIN);
906 
907 	if (!may_expand_vm(mm, vma->vm_flags,
908 				(new_len - old_len) >> PAGE_SHIFT))
909 		return ERR_PTR(-ENOMEM);
910 
911 	return vma;
912 }
913 
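/*
 * vma_to_resize() hands back either the vma itself or an ERR_PTR() encoding
 * -EFAULT, -EINVAL, -EAGAIN or -ENOMEM; callers such as mremap_to() and the
 * mremap() syscall below unwrap it with IS_ERR()/PTR_ERR().
 */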
914 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
915 		unsigned long new_addr, unsigned long new_len, bool *locked,
916 		unsigned long flags, struct vm_userfaultfd_ctx *uf,
917 		struct list_head *uf_unmap_early,
918 		struct list_head *uf_unmap)
919 {
920 	struct mm_struct *mm = current->mm;
921 	struct vm_area_struct *vma;
922 	unsigned long ret = -EINVAL;
923 	unsigned long map_flags = 0;
924 
925 	if (offset_in_page(new_addr))
926 		goto out;
927 
928 	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
929 		goto out;
930 
931 	/* Ensure the old/new locations do not overlap */
932 	if (addr + old_len > new_addr && new_addr + new_len > addr)
933 		goto out;
934 
935 	/*
936 	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
937 	 * it will bail out at the very beginning.
938 	 * That is a problem if we have already unmapped the regions here
939 	 * (new_addr and old_addr), because userspace will not know the
940 	 * state of the vmas after it gets -ENOMEM.
941 	 * So, to avoid such a scenario, we can pre-compute whether the whole
942 	 * operation has a high chance of succeeding map-wise.
943 	 * The worst case is when both vmas (new_addr and old_addr) get
944 	 * split in three before unmapping.
945 	 * That means 2 more maps (1 for each) to the ones we already hold.
946 	 * Check whether current map count plus 2 still leads us to 4 maps below
947 	 * the threshold, otherwise return -ENOMEM here to be more safe.
948 	 */
949 	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
950 		return -ENOMEM;
951 
952 	if (flags & MREMAP_FIXED) {
953 		/*
954 		 * In mremap_to(), the VMA is moved to the dst address,
955 		 * so munmap dst first.
956 		 * do_munmap() will check whether dst is sealed.
957 		 */
958 		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
959 		if (ret)
960 			goto out;
961 	}
962 
963 	if (old_len > new_len) {
964 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
965 		if (ret)
966 			goto out;
967 		old_len = new_len;
968 	}
969 
970 	vma = vma_to_resize(addr, old_len, new_len, flags);
971 	if (IS_ERR(vma)) {
972 		ret = PTR_ERR(vma);
973 		goto out;
974 	}
975 
976 	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
977 	if (flags & MREMAP_DONTUNMAP &&
978 		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
979 		ret = -ENOMEM;
980 		goto out;
981 	}
982 
983 	if (flags & MREMAP_FIXED)
984 		map_flags |= MAP_FIXED;
985 
986 	if (vma->vm_flags & VM_MAYSHARE)
987 		map_flags |= MAP_SHARED;
988 
989 	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
990 				((addr - vma->vm_start) >> PAGE_SHIFT),
991 				map_flags);
992 	if (IS_ERR_VALUE(ret))
993 		goto out;
994 
995 	/* We got a new mapping */
996 	if (!(flags & MREMAP_FIXED))
997 		new_addr = ret;
998 
999 	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
1000 		       uf_unmap);
1001 
1002 out:
1003 	return ret;
1004 }
1005 
1006 static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
1007 {
1008 	unsigned long end = vma->vm_end + delta;
1009 
1010 	if (end < vma->vm_end) /* overflow */
1011 		return 0;
1012 	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
1013 		return 0;
1014 	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
1015 			      0, MAP_FIXED) & ~PAGE_MASK)
1016 		return 0;
1017 	return 1;
1018 }
1019 
1020 /*
1021  * Expand (or shrink) an existing mapping, potentially moving it at the
1022  * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1023  *
1024  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
1025  * This option implies MREMAP_MAYMOVE.
1026  */
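/*
 * Userspace view (illustrative example; "target" is an assumed, suitably
 * aligned address, not something defined here):
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	// Grow to 8 KiB; the kernel may move the mapping if it cannot
 *	// expand it in place.
 *	p = mremap(p, 4096, 8192, MREMAP_MAYMOVE);
 *
 *	// Move the (now 8 KiB) mapping to "target", unmapping anything
 *	// that was there.
 *	p = mremap(p, 8192, 8192, MREMAP_MAYMOVE | MREMAP_FIXED, target);
 */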
1027 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1028 		unsigned long, new_len, unsigned long, flags,
1029 		unsigned long, new_addr)
1030 {
1031 	struct mm_struct *mm = current->mm;
1032 	struct vm_area_struct *vma;
1033 	unsigned long ret = -EINVAL;
1034 	bool locked = false;
1035 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
1036 	LIST_HEAD(uf_unmap_early);
1037 	LIST_HEAD(uf_unmap);
1038 
1039 	/*
1040 	 * There is a deliberate asymmetry here: we strip the pointer tag
1041 	 * from the old address but leave the new address alone. This is
1042 	 * for consistency with mmap(), where we prevent the creation of
1043 	 * aliasing mappings in userspace by leaving the tag bits of the
1044 	 * mapping address intact. A non-zero tag will cause the subsequent
1045 	 * range checks to reject the address as invalid.
1046 	 *
1047 	 * See Documentation/arch/arm64/tagged-address-abi.rst for more
1048 	 * information.
1049 	 */
1050 	addr = untagged_addr(addr);
1051 
1052 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
1053 		return ret;
1054 
1055 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
1056 		return ret;
1057 
1058 	/*
1059 	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
1060 	 * in the process.
1061 	 */
1062 	if (flags & MREMAP_DONTUNMAP &&
1063 			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
1064 		return ret;
1065 
1066 
1067 	if (__offset_in_page_log(addr))
1068 		return ret;
1069 
1070 	old_len = __PAGE_ALIGN(old_len);
1071 	new_len = __PAGE_ALIGN(new_len);
1072 
1073 	/*
1074 	 * We allow a zero old-len as a special case
1075 	 * for DOS-emu "duplicate shm area" thing. But
1076 	 * a zero new-len is nonsensical.
1077 	 */
1078 	if (!new_len)
1079 		return ret;
1080 
1081 	if (mmap_write_lock_killable(current->mm))
1082 		return -EINTR;
1083 	vma = vma_lookup(mm, addr);
1084 	if (!vma) {
1085 		ret = -EFAULT;
1086 		goto out;
1087 	}
1088 
1089 	/* Don't allow remapping vmas when they have already been sealed */
1090 	if (!can_modify_vma(vma)) {
1091 		ret = -EPERM;
1092 		goto out;
1093 	}
1094 
1095 	if (is_vm_hugetlb_page(vma)) {
1096 		struct hstate *h __maybe_unused = hstate_vma(vma);
1097 
1098 		old_len = ALIGN(old_len, huge_page_size(h));
1099 		new_len = ALIGN(new_len, huge_page_size(h));
1100 
1101 		/* addrs must be huge page aligned */
1102 		if (addr & ~huge_page_mask(h))
1103 			goto out;
1104 		if (new_addr & ~huge_page_mask(h))
1105 			goto out;
1106 
1107 		/*
1108 		 * Don't allow remap expansion, because the underlying hugetlb
1109 		 * reservation is not yet capable of handling split reservations.
1110 		 */
1111 		if (new_len > old_len)
1112 			goto out;
1113 	}
1114 
1115 	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
1116 		ret = mremap_to(addr, old_len, new_addr, new_len,
1117 				&locked, flags, &uf, &uf_unmap_early,
1118 				&uf_unmap);
1119 		goto out;
1120 	}
1121 
1122 	/*
1123 	 * Always allow a shrinking remap: that just unmaps
1124 	 * the unnecessary pages..
1125 	 * do_vmi_munmap does all the needed commit accounting, and
1126 	 * unlocks the mmap_lock if so directed.
1127 	 */
1128 	if (old_len >= new_len) {
1129 		VMA_ITERATOR(vmi, mm, addr + new_len);
1130 
1131 		if (old_len == new_len) {
1132 			ret = addr;
1133 			goto out;
1134 		}
1135 
1136 		ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
1137 				    &uf_unmap, true);
1138 		if (ret)
1139 			goto out;
1140 
1141 		ret = addr;
1142 		goto out_unlocked;
1143 	}
1144 
1145 	/*
1146 	 * Ok, we need to grow..
1147 	 */
1148 	vma = vma_to_resize(addr, old_len, new_len, flags);
1149 	if (IS_ERR(vma)) {
1150 		ret = PTR_ERR(vma);
1151 		goto out;
1152 	}
1153 
1154 	/* old_len exactly to the end of the area..
1155 	 */
1156 	if (old_len == vma->vm_end - addr) {
1157 		unsigned long delta = new_len - old_len;
1158 
1159 		/* can we just expand the current mapping? */
1160 		if (vma_expandable(vma, delta)) {
1161 			long pages = delta >> PAGE_SHIFT;
1162 			VMA_ITERATOR(vmi, mm, vma->vm_end);
1163 			long charged = 0;
1164 
1165 			if (vma->vm_flags & VM_ACCOUNT) {
1166 				if (security_vm_enough_memory_mm(mm, pages)) {
1167 					ret = -ENOMEM;
1168 					goto out;
1169 				}
1170 				charged = pages;
1171 			}
1172 
1173 			/*
1174 			 * Function vma_merge_extend() is called on the
1175 			 * extension we are adding to the already existing
1176 			 * vma.  It will merge this extension with the
1177 			 * existing vma (the expand operation itself) and
1178 			 * possibly also with the next vma, if that becomes
1179 			 * adjacent to the expanded vma and is otherwise
1180 			 * compatible.
1181 			 */
1182 			vma = vma_merge_extend(&vmi, vma, delta);
1183 			if (!vma) {
1184 				vm_unacct_memory(charged);
1185 				ret = -ENOMEM;
1186 				goto out;
1187 			}
1188 
1189 			vm_stat_account(mm, vma->vm_flags, pages);
1190 			if (vma->vm_flags & VM_LOCKED) {
1191 				mm->locked_vm += pages;
1192 				locked = true;
1193 				new_addr = addr;
1194 			}
1195 			ret = addr;
1196 			goto out;
1197 		}
1198 	}
1199 
1200 	/*
1201 	 * We weren't able to just expand or shrink the area,
1202 	 * we need to create a new one and move it..
1203 	 */
1204 	ret = -ENOMEM;
1205 	if (flags & MREMAP_MAYMOVE) {
1206 		unsigned long map_flags = 0;
1207 		if (vma->vm_flags & VM_MAYSHARE)
1208 			map_flags |= MAP_SHARED;
1209 
1210 		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
1211 					vma->vm_pgoff +
1212 					((addr - vma->vm_start) >> PAGE_SHIFT),
1213 					map_flags);
1214 		if (IS_ERR_VALUE(new_addr)) {
1215 			ret = new_addr;
1216 			goto out;
1217 		}
1218 
1219 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
1220 			       &locked, flags, &uf, &uf_unmap);
1221 	}
1222 out:
1223 	if (offset_in_page(ret))
1224 		locked = false;
1225 	mmap_write_unlock(current->mm);
1226 	if (locked && new_len > old_len)
1227 		mm_populate(new_addr + old_len, new_len - old_len);
1228 out_unlocked:
1229 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
1230 	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
1231 	userfaultfd_unmap_complete(mm, &uf_unmap);
1232 	return ret;
1233 }
1234