1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *	linux/mm/madvise.c
4  *
5  * Copyright (C) 1999  Linus Torvalds
6  * Copyright (C) 2002  Christoph Hellwig
7  */
8 
9 #include <linux/mman.h>
10 #include <linux/pagemap.h>
11 #include <linux/syscalls.h>
12 #include <linux/mempolicy.h>
13 #include <linux/page_size_compat.h>
14 #include <linux/page-isolation.h>
15 #include <linux/pgsize_migration.h>
16 #include <linux/page_idle.h>
17 #include <linux/userfaultfd_k.h>
18 #include <linux/hugetlb.h>
19 #include <linux/falloc.h>
20 #include <linux/fadvise.h>
21 #include <linux/sched.h>
22 #include <linux/sched/mm.h>
23 #include <linux/mm_inline.h>
24 #include <linux/string.h>
25 #include <linux/uio.h>
26 #include <linux/ksm.h>
27 #include <linux/fs.h>
28 #include <linux/file.h>
29 #include <linux/blkdev.h>
30 #include <linux/backing-dev.h>
31 #include <linux/pagewalk.h>
32 #include <linux/swap.h>
33 #include <linux/swapops.h>
34 #include <linux/shmem_fs.h>
35 #include <linux/mmu_notifier.h>
36 #include <trace/hooks/mm.h>
37 #include <trace/hooks/madvise.h>
38 
39 #include <asm/tlb.h>
40 
41 #include "internal.h"
42 #include "swap.h"
43 
44 /*
45  * Maximum number of attempts we make to install guard pages before we give up
46  * and return -ERESTARTNOINTR to have userspace try again.
47  */
48 #define MAX_MADVISE_GUARD_RETRIES 3
49 
50 struct madvise_walk_private {
51 	struct mmu_gather *tlb;
52 	bool pageout;
53 	void *private;
54 };
55 
56 enum madvise_lock_mode {
57 	MADVISE_NO_LOCK,
58 	MADVISE_MMAP_READ_LOCK,
59 	MADVISE_MMAP_WRITE_LOCK,
60 	MADVISE_VMA_READ_LOCK,
61 };
62 
63 struct madvise_behavior {
64 	int behavior;
65 	struct mmu_gather *tlb;
66 	enum madvise_lock_mode lock_mode;
67 };
68 
69 #ifdef CONFIG_ANON_VMA_NAME
70 struct anon_vma_name *anon_vma_name_alloc(const char *name)
71 {
72 	struct anon_vma_name *anon_name;
73 	size_t count;
74 
75 	/* Add 1 for NUL terminator at the end of the anon_name->name */
76 	count = strlen(name) + 1;
77 	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
78 	if (anon_name) {
79 		kref_init(&anon_name->kref);
80 		memcpy(anon_name->name, name, count);
81 	}
82 
83 	return anon_name;
84 }
85 
86 void anon_vma_name_free(struct kref *kref)
87 {
88 	struct anon_vma_name *anon_name =
89 			container_of(kref, struct anon_vma_name, kref);
90 	kfree(anon_name);
91 }
92 
93 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
94 {
95 	if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
96 		vma_assert_locked(vma);
97 
98 	return vma->anon_name;
99 }
100 EXPORT_SYMBOL_GPL(anon_vma_name);
101 
102 /* mmap_lock should be write-locked */
103 static int replace_anon_vma_name(struct vm_area_struct *vma,
104 				 struct anon_vma_name *anon_name)
105 {
106 	struct anon_vma_name *orig_name = anon_vma_name(vma);
107 
108 	if (!anon_name) {
109 		vma->anon_name = NULL;
110 		anon_vma_name_put(orig_name);
111 		return 0;
112 	}
113 
114 	if (anon_vma_name_eq(orig_name, anon_name))
115 		return 0;
116 
117 	vma->anon_name = anon_vma_name_reuse(anon_name);
118 	anon_vma_name_put(orig_name);
119 
120 	return 0;
121 }
122 #else /* CONFIG_ANON_VMA_NAME */
123 static int replace_anon_vma_name(struct vm_area_struct *vma,
124 				 struct anon_vma_name *anon_name)
125 {
126 	if (anon_name)
127 		return -EINVAL;
128 
129 	return 0;
130 }
131 #endif /* CONFIG_ANON_VMA_NAME */
132 /*
133  * Update the vm_flags on a region of a vma, splitting it or merging it as
134  * necessary.  Must be called with mmap_lock held for writing; the caller
135  * should ensure anon_name stability by raising its refcount, even when
136  * anon_name belongs to a valid vma, because this function might free that vma.
137  */
138 static int madvise_update_vma(struct vm_area_struct *vma,
139 			      struct vm_area_struct **prev, unsigned long start,
140 			      unsigned long end, unsigned long new_flags,
141 			      struct anon_vma_name *anon_name)
142 {
143 	struct mm_struct *mm = vma->vm_mm;
144 	int error;
145 	VMA_ITERATOR(vmi, mm, start);
146 
147 	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
148 		*prev = vma;
149 		return 0;
150 	}
151 
152 	vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
153 				    anon_name);
154 	if (IS_ERR(vma))
155 		return PTR_ERR(vma);
156 
157 	*prev = vma;
158 
159 	/* vm_flags is protected by the mmap_lock held in write mode. */
160 	vma_start_write(vma);
161 	vm_flags_reset(vma, new_flags);
162 	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
163 		error = replace_anon_vma_name(vma, anon_name);
164 		if (error)
165 			return error;
166 	}
167 
168 	return 0;
169 }
170 
171 #ifdef CONFIG_SWAP
172 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
173 		unsigned long end, struct mm_walk *walk)
174 {
175 	struct vm_area_struct *vma = walk->private;
176 	struct swap_iocb *splug = NULL;
177 	pte_t *ptep = NULL;
178 	spinlock_t *ptl;
179 	unsigned long addr;
180 
181 	for (addr = start; addr < end; addr += PAGE_SIZE) {
182 		pte_t pte;
183 		swp_entry_t entry;
184 		struct folio *folio;
185 
186 		if (!ptep++) {
187 			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
188 			if (!ptep)
189 				break;
190 		}
191 
192 		pte = ptep_get(ptep);
193 		if (!is_swap_pte(pte))
194 			continue;
195 		entry = pte_to_swp_entry(pte);
196 		if (unlikely(non_swap_entry(entry)))
197 			continue;
198 
199 		pte_unmap_unlock(ptep, ptl);
200 		ptep = NULL;
201 		trace_android_vh_madvise_swapin_walk_pmd_entry(entry);
202 
203 		folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
204 					     vma, addr, &splug);
205 		if (folio)
206 			folio_put(folio);
207 	}
208 
209 	if (ptep)
210 		pte_unmap_unlock(ptep, ptl);
211 	swap_read_unplug(splug);
212 	cond_resched();
213 
214 	return 0;
215 }
216 
217 static const struct mm_walk_ops swapin_walk_ops = {
218 	.pmd_entry		= swapin_walk_pmd_entry,
219 	.walk_lock		= PGWALK_RDLOCK,
220 };
221 
222 static void shmem_swapin_range(struct vm_area_struct *vma,
223 		unsigned long start, unsigned long end,
224 		struct address_space *mapping)
225 {
226 	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
227 	pgoff_t end_index = linear_page_index(vma, end) - 1;
228 	struct folio *folio;
229 	struct swap_iocb *splug = NULL;
230 
231 	rcu_read_lock();
232 	xas_for_each(&xas, folio, end_index) {
233 		unsigned long addr;
234 		swp_entry_t entry;
235 
236 		if (!xa_is_value(folio))
237 			continue;
238 		entry = radix_to_swp_entry(folio);
239 		/* There might be swapin error entries in shmem mapping. */
240 		if (non_swap_entry(entry))
241 			continue;
242 
243 		addr = vma->vm_start +
244 			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
245 		xas_pause(&xas);
246 		rcu_read_unlock();
247 
248 		folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
249 					     vma, addr, &splug);
250 		if (folio)
251 			folio_put(folio);
252 
253 		rcu_read_lock();
254 	}
255 	rcu_read_unlock();
256 	swap_read_unplug(splug);
257 }
258 #endif		/* CONFIG_SWAP */
259 
260 /*
261  * Schedule all required I/O operations.  Do not wait for completion.
262  */
263 static long madvise_willneed(struct vm_area_struct *vma,
264 			     struct vm_area_struct **prev,
265 			     unsigned long start, unsigned long end)
266 {
267 	struct mm_struct *mm = vma->vm_mm;
268 	struct file *file = vma->vm_file;
269 	loff_t offset;
270 
271 	*prev = vma;
272 #ifdef CONFIG_SWAP
273 	if (!file) {
274 		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
275 		lru_add_drain(); /* Push any new pages onto the LRU now */
276 		return 0;
277 	}
278 
279 	if (shmem_mapping(file->f_mapping)) {
280 		shmem_swapin_range(vma, start, end, file->f_mapping);
281 		lru_add_drain(); /* Push any new pages onto the LRU now */
282 		return 0;
283 	}
284 #else
285 	if (!file)
286 		return -EBADF;
287 #endif
288 
289 	if (IS_DAX(file_inode(file))) {
290 		/* no bad return value, but ignore advice */
291 		return 0;
292 	}
293 
294 	/*
295 	 * Filesystem's fadvise may need to take various locks.  We need to
296 	 * explicitly grab a reference because the vma (and hence the
297 	 * vma's reference to the file) can go away as soon as we drop
298 	 * mmap_lock.
299 	 */
300 	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
301 	get_file(file);
302 	offset = (loff_t)(start - vma->vm_start)
303 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
304 	mmap_read_unlock(mm);
305 	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
306 	fput(file);
307 	mmap_read_lock(mm);
308 	return 0;
309 }
310 
311 static inline bool can_do_file_pageout(struct vm_area_struct *vma)
312 {
313 	if (!vma->vm_file)
314 		return false;
315 	/*
316 	 * We page out pagecache only for non-anonymous mappings that correspond
317 	 * to files the calling process could open for writing (if it tried);
318 	 * otherwise we'd be including shared non-exclusive mappings, which
319 	 * opens a side channel.
320 	 */
321 	return inode_owner_or_capable(&nop_mnt_idmap,
322 				      file_inode(vma->vm_file)) ||
323 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
324 }
325 
326 static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
327 					  struct folio *folio, pte_t *ptep,
328 					  pte_t pte, bool *any_young,
329 					  bool *any_dirty)
330 {
331 	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
332 	int max_nr = (end - addr) / PAGE_SIZE;
333 
334 	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
335 			       any_young, any_dirty);
336 }
337 
338 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
339 				unsigned long addr, unsigned long end,
340 				struct mm_walk *walk)
341 {
342 	struct madvise_walk_private *private = walk->private;
343 	struct mmu_gather *tlb = private->tlb;
344 	bool pageout = private->pageout;
345 	struct mm_struct *mm = tlb->mm;
346 	struct vm_area_struct *vma = walk->vma;
347 	pte_t *start_pte, *pte, ptent;
348 	spinlock_t *ptl;
349 	struct folio *folio = NULL;
350 	LIST_HEAD(folio_list);
351 	bool pageout_anon_only_filter;
352 	unsigned int batch_count = 0;
353 	bool abort_madvise = false;
354 	int nr;
355 	int ret = 0;
356 
357 	trace_android_vh_madvise_cold_or_pageout_abort(vma, &abort_madvise);
358 	if (fatal_signal_pending(current) || abort_madvise)
359 		return -EINTR;
360 
361 	trace_android_vh_madvise_pageout_bypass(mm, pageout, &ret);
362 	if (ret)
363 		return ret;
364 
365 	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
366 					!can_do_file_pageout(vma);
367 
368 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
369 	if (pmd_trans_huge(*pmd)) {
370 		pmd_t orig_pmd;
371 		unsigned long next = pmd_addr_end(addr, end);
372 
373 		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
374 		ptl = pmd_trans_huge_lock(pmd, vma);
375 		if (!ptl)
376 			return 0;
377 
378 		orig_pmd = *pmd;
379 		if (is_huge_zero_pmd(orig_pmd))
380 			goto huge_unlock;
381 
382 		if (unlikely(!pmd_present(orig_pmd))) {
383 			VM_BUG_ON(thp_migration_supported() &&
384 					!is_pmd_migration_entry(orig_pmd));
385 			goto huge_unlock;
386 		}
387 
388 		folio = pmd_folio(orig_pmd);
389 
390 		/* Do not interfere with other mappings of this folio */
391 		if (folio_likely_mapped_shared(folio))
392 			goto huge_unlock;
393 
394 		if (pageout_anon_only_filter && !folio_test_anon(folio))
395 			goto huge_unlock;
396 
397 		if (next - addr != HPAGE_PMD_SIZE) {
398 			int err;
399 
400 			folio_get(folio);
401 			spin_unlock(ptl);
402 			folio_lock(folio);
403 			err = split_folio(folio);
404 			folio_unlock(folio);
405 			folio_put(folio);
406 			if (!err)
407 				goto regular_folio;
408 			return 0;
409 		}
410 
411 		if (!pageout && pmd_young(orig_pmd)) {
412 			pmdp_invalidate(vma, addr, pmd);
413 			orig_pmd = pmd_mkold(orig_pmd);
414 
415 			set_pmd_at(mm, addr, pmd, orig_pmd);
416 			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
417 		}
418 
419 		folio_clear_referenced(folio);
420 		folio_test_clear_young(folio);
421 		if (folio_test_active(folio))
422 			folio_set_workingset(folio);
423 		if (pageout) {
424 			if (folio_isolate_lru(folio)) {
425 				if (folio_test_unevictable(folio))
426 					folio_putback_lru(folio);
427 				else
428 					list_add(&folio->lru, &folio_list);
429 			}
430 		} else
431 			folio_deactivate(folio);
432 huge_unlock:
433 		spin_unlock(ptl);
434 		if (pageout)
435 			__reclaim_pages(&folio_list, private->private);
436 		return 0;
437 	}
438 
439 regular_folio:
440 #endif
441 	tlb_change_page_size(tlb, PAGE_SIZE);
442 restart:
443 	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
444 	if (!start_pte)
445 		return 0;
446 	flush_tlb_batched_pending(mm);
447 	arch_enter_lazy_mmu_mode();
448 	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
449 		bool need_skip = false;
450 		nr = 1;
451 		ptent = ptep_get(pte);
452 
453 		if (++batch_count == SWAP_CLUSTER_MAX) {
454 			batch_count = 0;
455 			if (need_resched()) {
456 				arch_leave_lazy_mmu_mode();
457 				pte_unmap_unlock(start_pte, ptl);
458 				cond_resched();
459 				goto restart;
460 			}
461 		}
462 
463 		if (pte_none(ptent))
464 			continue;
465 
466 		if (!pte_present(ptent))
467 			continue;
468 
469 		folio = vm_normal_folio(vma, addr, ptent);
470 		if (!folio || folio_is_zone_device(folio))
471 			continue;
472 
473 		trace_android_vh_madvise_cold_pageout_skip(vma, folio, pageout,
474 			&need_skip);
475 
476 		if (need_skip)
477 			continue;
478 
479 		/*
480 		 * If we encounter a large folio, only split it if it is not
481 		 * fully mapped within the range we are operating on. Otherwise
482 		 * leave it as is so that it can be swapped out whole. If we
483 		 * fail to split a folio, leave it in place and advance to the
484 		 * next pte in the range.
485 		 */
486 		if (folio_test_large(folio)) {
487 			bool any_young;
488 
489 			nr = madvise_folio_pte_batch(addr, end, folio, pte,
490 						     ptent, &any_young, NULL);
491 
492 			if (any_young)
493 				ptent = pte_mkyoung(ptent);
494 
495 			if (nr < folio_nr_pages(folio)) {
496 				int err;
497 				bool bypass = false;
498 
499 				trace_android_vh_split_large_folio_bypass(&bypass);
500 				if (bypass)
501 					continue;
502 				if (folio_likely_mapped_shared(folio))
503 					continue;
504 				if (pageout_anon_only_filter && !folio_test_anon(folio))
505 					continue;
506 				if (!folio_trylock(folio))
507 					continue;
508 				folio_get(folio);
509 				arch_leave_lazy_mmu_mode();
510 				pte_unmap_unlock(start_pte, ptl);
511 				start_pte = NULL;
512 				err = split_folio(folio);
513 				folio_unlock(folio);
514 				folio_put(folio);
515 				start_pte = pte =
516 					pte_offset_map_lock(mm, pmd, addr, &ptl);
517 				if (!start_pte)
518 					break;
519 				flush_tlb_batched_pending(mm);
520 				arch_enter_lazy_mmu_mode();
521 				if (!err)
522 					nr = 0;
523 				continue;
524 			}
525 		}
526 
527 		/*
528 		 * Do not interfere with other mappings of this folio, and skip
529 		 * non-LRU folios. If we have a large folio at this point, we
530 		 * know it is fully mapped so if its mapcount is the same as its
531 		 * number of pages, it must be exclusive.
532 		 */
533 		if (!folio_test_lru(folio) ||
534 		    folio_mapcount(folio) != folio_nr_pages(folio))
535 			continue;
536 
537 		if (pageout_anon_only_filter && !folio_test_anon(folio))
538 			continue;
539 
540 		if (!pageout && pte_young(ptent)) {
541 			clear_young_dirty_ptes(vma, addr, pte, nr,
542 					       CYDP_CLEAR_YOUNG);
543 			tlb_remove_tlb_entries(tlb, pte, nr, addr);
544 		}
545 
546 		/*
547 		 * We are deactivating a folio for accelerating reclaiming.
548 		 * VM couldn't reclaim the folio unless we clear PG_young.
549 		 * As a side effect, it confuses idle-page tracking
550 		 * because it will miss the recent referenced history.
551 		 */
552 		folio_clear_referenced(folio);
553 		folio_test_clear_young(folio);
554 		if (folio_test_active(folio))
555 			folio_set_workingset(folio);
556 		if (pageout) {
557 			if (folio_isolate_lru(folio)) {
558 				if (folio_test_unevictable(folio))
559 					folio_putback_lru(folio);
560 				else
561 					list_add(&folio->lru, &folio_list);
562 			}
563 		} else
564 			folio_deactivate(folio);
565 	}
566 
567 	if (start_pte) {
568 		arch_leave_lazy_mmu_mode();
569 		pte_unmap_unlock(start_pte, ptl);
570 	}
571 	if (pageout)
572 		__reclaim_pages(&folio_list, private->private);
573 	cond_resched();
574 
575 	return 0;
576 }
577 
578 static const struct mm_walk_ops cold_walk_ops = {
579 	.pmd_entry = madvise_cold_or_pageout_pte_range,
580 	.walk_lock = PGWALK_RDLOCK,
581 };
582 
583 static void madvise_cold_page_range(struct mmu_gather *tlb,
584 			     struct vm_area_struct *vma,
585 			     unsigned long addr, unsigned long end)
586 {
587 	struct madvise_walk_private walk_private = {
588 		.pageout = false,
589 		.tlb = tlb,
590 	};
591 
592 	tlb_start_vma(tlb, vma);
593 	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
594 	tlb_end_vma(tlb, vma);
595 }
596 
597 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
598 {
599 	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
600 }
601 
602 static long madvise_cold(struct vm_area_struct *vma,
603 			struct vm_area_struct **prev,
604 			unsigned long start_addr, unsigned long end_addr)
605 {
606 	struct mm_struct *mm = vma->vm_mm;
607 	struct mmu_gather tlb;
608 
609 	*prev = vma;
610 	if (!can_madv_lru_vma(vma))
611 		return -EINVAL;
612 
613 	lru_add_drain();
614 	tlb_gather_mmu(&tlb, mm);
615 	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
616 	tlb_finish_mmu(&tlb);
617 
618 	return 0;
619 }
620 
621 static int madvise_pageout_page_range(struct mmu_gather *tlb,
622 			     struct vm_area_struct *vma,
623 			     unsigned long addr, unsigned long end)
624 {
625 	struct madvise_walk_private walk_private = {
626 		.pageout = true,
627 		.tlb = tlb,
628 	};
629 	int ret;
630 	LIST_HEAD(folio_list);
631 
632 	trace_android_rvh_madvise_pageout_begin(&walk_private.private);
633 
634 	tlb_start_vma(tlb, vma);
635 	ret = walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
636 	tlb_end_vma(tlb, vma);
637 
638 	trace_android_rvh_madvise_pageout_end(walk_private.private, &folio_list);
639 	if (!list_empty(&folio_list))
640 		reclaim_pages(&folio_list);
641 
642 	return ret;
643 }
644 
645 static long madvise_pageout(struct vm_area_struct *vma,
646 			struct vm_area_struct **prev,
647 			unsigned long start_addr, unsigned long end_addr)
648 {
649 	struct mm_struct *mm = vma->vm_mm;
650 	struct mmu_gather tlb;
651 	int ret;
652 	bool return_error = false;
653 
654 	*prev = vma;
655 	if (!can_madv_lru_vma(vma))
656 		return -EINVAL;
657 
658 	/*
659 	 * If the VMA belongs to a private file mapping, there can be private
660 	 * dirty pages which can be paged out even if this process is neither
661 	 * the owner nor write capable of the file. We therefore allow private
662 	 * file mappings to proceed so that dirty anon pages can be paged out.
663 	 */
664 	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
665 				(vma->vm_flags & VM_MAYSHARE)))
666 		return 0;
667 
668 	lru_add_drain();
669 	tlb_gather_mmu(&tlb, mm);
670 	ret = madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
671 	tlb_finish_mmu(&tlb);
672 
673 	trace_android_vh_madvise_pageout_return_error(ret, &return_error);
674 	if (return_error)
675 		return (long)ret;
676 
677 	return 0;
678 }
679 
680 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
681 				unsigned long end, struct mm_walk *walk)
682 
683 {
684 	const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
685 	struct mmu_gather *tlb = walk->private;
686 	struct mm_struct *mm = tlb->mm;
687 	struct vm_area_struct *vma = walk->vma;
688 	spinlock_t *ptl;
689 	pte_t *start_pte, *pte, ptent;
690 	struct folio *folio;
691 	int nr_swap = 0;
692 	unsigned long next;
693 	int nr, max_nr;
694 
695 	next = pmd_addr_end(addr, end);
696 	if (pmd_trans_huge(*pmd))
697 		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
698 			return 0;
699 
700 	tlb_change_page_size(tlb, PAGE_SIZE);
701 	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
702 	if (!start_pte)
703 		return 0;
704 	flush_tlb_batched_pending(mm);
705 	arch_enter_lazy_mmu_mode();
706 	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
707 		nr = 1;
708 		ptent = ptep_get(pte);
709 
710 		if (pte_none(ptent))
711 			continue;
712 		/*
713 		 * If the pte holds a swap entry, just clear the page table
714 		 * entry to prevent swap-in, which is more expensive than
715 		 * (page allocation + zeroing).
716 		 */
717 		if (!pte_present(ptent)) {
718 			swp_entry_t entry;
719 
720 			entry = pte_to_swp_entry(ptent);
721 			if (!non_swap_entry(entry)) {
722 				max_nr = (end - addr) / PAGE_SIZE;
723 				nr = swap_pte_batch(pte, max_nr, ptent);
724 				nr_swap -= nr;
725 				free_swap_and_cache_nr(entry, nr);
726 				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
727 			} else if (is_hwpoison_entry(entry) ||
728 				   is_poisoned_swp_entry(entry)) {
729 				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
730 			}
731 			continue;
732 		}
733 
734 		folio = vm_normal_folio(vma, addr, ptent);
735 		if (!folio || folio_is_zone_device(folio))
736 			continue;
737 
738 		/*
739 		 * If we encounter a large folio, only split it if it is not
740 		 * fully mapped within the range we are operating on. Otherwise
741 		 * leave it as is so that it can be marked as lazyfree. If we
742 		 * fail to split a folio, leave it in place and advance to the
743 		 * next pte in the range.
744 		 */
745 		if (folio_test_large(folio)) {
746 			bool any_young, any_dirty;
747 
748 			nr = madvise_folio_pte_batch(addr, end, folio, pte,
749 						     ptent, &any_young, &any_dirty);
750 
751 			if (nr < folio_nr_pages(folio)) {
752 				int err;
753 
754 				if (folio_likely_mapped_shared(folio))
755 					continue;
756 				if (!folio_trylock(folio))
757 					continue;
758 				folio_get(folio);
759 				arch_leave_lazy_mmu_mode();
760 				pte_unmap_unlock(start_pte, ptl);
761 				start_pte = NULL;
762 				err = split_folio(folio);
763 				folio_unlock(folio);
764 				folio_put(folio);
765 				pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
766 				start_pte = pte;
767 				if (!start_pte)
768 					break;
769 				flush_tlb_batched_pending(mm);
770 				arch_enter_lazy_mmu_mode();
771 				if (!err)
772 					nr = 0;
773 				continue;
774 			}
775 
776 			if (any_young)
777 				ptent = pte_mkyoung(ptent);
778 			if (any_dirty)
779 				ptent = pte_mkdirty(ptent);
780 		}
781 
782 		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
783 			if (!folio_trylock(folio))
784 				continue;
785 			/*
786 			 * If we have a large folio at this point, we know it is
787 			 * fully mapped so if its mapcount is the same as its
788 			 * number of pages, it must be exclusive.
789 			 */
790 			if (folio_mapcount(folio) != folio_nr_pages(folio)) {
791 				folio_unlock(folio);
792 				continue;
793 			}
794 
795 			if (folio_test_swapcache(folio) &&
796 			    !folio_free_swap(folio)) {
797 				folio_unlock(folio);
798 				continue;
799 			}
800 
801 			folio_clear_dirty(folio);
802 			folio_unlock(folio);
803 		}
804 
805 		if (pte_young(ptent) || pte_dirty(ptent)) {
806 			clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
807 			tlb_remove_tlb_entries(tlb, pte, nr, addr);
808 		}
809 		folio_mark_lazyfree(folio);
810 	}
811 
812 	if (nr_swap)
813 		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
814 	if (start_pte) {
815 		arch_leave_lazy_mmu_mode();
816 		pte_unmap_unlock(start_pte, ptl);
817 	}
818 	cond_resched();
819 
820 	return 0;
821 }
822 
823 static const struct mm_walk_ops madvise_free_walk_ops = {
824 	.pmd_entry		= madvise_free_pte_range,
825 	.walk_lock		= PGWALK_RDLOCK,
826 };
827 
828 static int madvise_free_single_vma(struct vm_area_struct *vma,
829 			unsigned long start_addr, unsigned long end_addr)
830 {
831 	struct mm_struct *mm = vma->vm_mm;
832 	struct mmu_notifier_range range;
833 	struct mmu_gather tlb;
834 
835 	/* MADV_FREE works for only anon vma at the moment */
836 	if (!vma_is_anonymous(vma))
837 		return -EINVAL;
838 
839 	range.start = max(vma->vm_start, start_addr);
840 	if (range.start >= vma->vm_end)
841 		return -EINVAL;
842 	range.end = min(vma->vm_end, end_addr);
843 	if (range.end <= vma->vm_start)
844 		return -EINVAL;
845 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
846 				range.start, range.end);
847 
848 	lru_add_drain();
849 	tlb_gather_mmu(&tlb, mm);
850 	update_hiwater_rss(mm);
851 
852 	mmu_notifier_invalidate_range_start(&range);
853 	tlb_start_vma(&tlb, vma);
854 	walk_page_range(vma->vm_mm, range.start, range.end,
855 			&madvise_free_walk_ops, &tlb);
856 	tlb_end_vma(&tlb, vma);
857 	mmu_notifier_invalidate_range_end(&range);
858 	tlb_finish_mmu(&tlb);
859 
860 	return 0;
861 }
862 
863 /*
864  * Application no longer needs these pages.  If the pages are dirty,
865  * it's OK to just throw them away.  The app will be more careful about
866  * data it wants to keep.  Be sure to free swap resources too.  The
867  * zap_page_range_single call sets things up for shrink_active_list to actually
868  * free these pages later if no one else has touched them in the meantime,
869  * although we could add these pages to a global reuse list for
870  * shrink_active_list to pick up before reclaiming other pages.
871  *
872  * NB: This interface discards data rather than pushes it out to swap,
873  * as some implementations do.  This has performance implications for
874  * applications like large transactional databases which want to discard
875  * pages in anonymous maps after committing to backing store the data
876  * that was kept in them.  There is no reason to write this data out to
877  * the swap area if the application is discarding it.
878  *
879  * An interface that causes the system to free clean pages and flush
880  * dirty pages is already available as msync(MS_INVALIDATE).
881  */
882 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
883 					unsigned long start, unsigned long end)
884 {
885 	madvise_vma_pad_pages(vma, start, end);
886 
887 	zap_page_range_single(vma, start, end - start, NULL);
888 	return 0;
889 }
890 
891 static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
892 					    unsigned long start,
893 					    unsigned long *end,
894 					    int behavior)
895 {
896 	if (!is_vm_hugetlb_page(vma)) {
897 		unsigned int forbidden = VM_PFNMAP;
898 
899 		if (behavior != MADV_DONTNEED_LOCKED)
900 			forbidden |= VM_LOCKED;
901 
902 		return !(vma->vm_flags & forbidden);
903 	}
904 
905 	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
906 		return false;
907 	if (start & ~huge_page_mask(hstate_vma(vma)))
908 		return false;
909 
910 	/*
911 	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
912 	 * boundaries, and may be unaware that this VMA uses huge pages.
913 	 * Avoid unexpected data loss by rounding down the number of
914 	 * huge pages freed.
915 	 */
916 	*end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
917 
918 	return true;
919 }
920 
921 static long madvise_dontneed_free(struct vm_area_struct *vma,
922 				  struct vm_area_struct **prev,
923 				  unsigned long start, unsigned long end,
924 				  struct madvise_behavior *madv_behavior)
925 {
926 	int behavior = madv_behavior->behavior;
927 	struct mm_struct *mm = vma->vm_mm;
928 
929 	*prev = vma;
930 	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
931 		return -EINVAL;
932 
933 	if (start == end)
934 		return 0;
935 
936 	if (!userfaultfd_remove(vma, start, end)) {
937 		*prev = NULL; /* mmap_lock has been dropped, prev is stale */
938 
939 		mmap_read_lock(mm);
940 		vma = vma_lookup(mm, start);
941 		if (!vma)
942 			return -ENOMEM;
943 		/*
944 		 * Potential end adjustment for hugetlb vma is OK as
945 		 * the check below keeps end within vma.
946 		 */
947 		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
948 						     behavior))
949 			return -EINVAL;
950 		if (end > vma->vm_end) {
951 			/*
952 			 * Don't fail if end > vma->vm_end. If the old
953 			 * vma was split while the mmap_lock was
954 			 * released the effect of the concurrent
955 			 * operation may not cause madvise() to
956 			 * have an undefined result. There may be an
957 			 * adjacent next vma that we'll walk
958 			 * next. userfaultfd_remove() will generate an
959 			 * UFFD_EVENT_REMOVE repetition on the
960 			 * end-vma->vm_end range, but the manager can
961 			 * handle a repetition fine.
962 			 */
963 			end = vma->vm_end;
964 		}
965 		/*
966 		 * If the memory region between start and end was
967 		 * originally backed by 4kB pages and then remapped to
968 		 * be backed by hugepages while mmap_lock was dropped,
969 		 * the adjustment for hugetlb vma above may have rounded
970 		 * end down to the start address.
971 		 */
972 		if (start == end)
973 			return 0;
974 		VM_WARN_ON(start > end);
975 	}
976 
977 	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
978 		return madvise_dontneed_single_vma(vma, start, end);
979 	else if (behavior == MADV_FREE)
980 		return madvise_free_single_vma(vma, start, end);
981 	else
982 		return -EINVAL;
983 }
984 
985 static long madvise_populate(struct mm_struct *mm, unsigned long start,
986 		unsigned long end, int behavior)
987 {
988 	const bool write = behavior == MADV_POPULATE_WRITE;
989 	int locked = 1;
990 	long pages;
991 
992 	while (start < end) {
993 		/* Populate (prefault) page tables readable/writable. */
994 		pages = faultin_page_range(mm, start, end, write, &locked);
995 		if (!locked) {
996 			mmap_read_lock(mm);
997 			locked = 1;
998 		}
999 		if (pages < 0) {
1000 			switch (pages) {
1001 			case -EINTR:
1002 				return -EINTR;
1003 			case -EINVAL: /* Incompatible mappings / permissions. */
1004 				return -EINVAL;
1005 			case -EHWPOISON:
1006 				return -EHWPOISON;
1007 			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
1008 				return -EFAULT;
1009 			default:
1010 				pr_warn_once("%s: unhandled return value: %ld\n",
1011 					     __func__, pages);
1012 				fallthrough;
1013 			case -ENOMEM: /* No VMA or out of memory. */
1014 				return -ENOMEM;
1015 			}
1016 		}
1017 		start += pages * PAGE_SIZE;
1018 	}
1019 	return 0;
1020 }
1021 
1022 /*
1023  * Application wants to free up the pages and associated backing store.
1024  * This is effectively punching a hole into the middle of a file.
1025  */
1026 static long madvise_remove(struct vm_area_struct *vma,
1027 				struct vm_area_struct **prev,
1028 				unsigned long start, unsigned long end)
1029 {
1030 	loff_t offset;
1031 	int error;
1032 	struct file *f;
1033 	struct mm_struct *mm = vma->vm_mm;
1034 
1035 	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
1036 
1037 	if (vma->vm_flags & VM_LOCKED)
1038 		return -EINVAL;
1039 
1040 	f = vma->vm_file;
1041 
1042 	if (!f || !f->f_mapping || !f->f_mapping->host) {
1043 			return -EINVAL;
1044 	}
1045 
1046 	if (!vma_is_shared_maywrite(vma))
1047 		return -EACCES;
1048 
1049 	offset = (loff_t)(start - vma->vm_start)
1050 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
1051 
1052 	/*
1053 	 * Filesystem's fallocate may need to take i_rwsem.  We need to
1054 	 * explicitly grab a reference because the vma (and hence the
1055 	 * vma's reference to the file) can go away as soon as we drop
1056 	 * mmap_lock.
1057 	 */
1058 	get_file(f);
1059 	if (userfaultfd_remove(vma, start, end)) {
1060 		/* mmap_lock was not released by userfaultfd_remove() */
1061 		mmap_read_unlock(mm);
1062 	}
1063 	error = vfs_fallocate(f,
1064 				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1065 				offset, end - start);
1066 	fput(f);
1067 	mmap_read_lock(mm);
1068 	return error;
1069 }
1070 
1071 static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
1072 {
1073 	vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;
1074 
1075 	/*
1076 	 * A user could lock after setting a guard range but that's fine, as
1077 	 * they'd not be able to fault in. The issue arises when we try to zap
1078 	 * existing locked VMAs. We don't want to do that.
1079 	 */
1080 	if (!allow_locked)
1081 		disallowed |= VM_LOCKED;
1082 
1083 	return !(vma->vm_flags & disallowed);
1084 }
1085 
1086 static bool is_guard_pte_marker(pte_t ptent)
1087 {
1088 	return is_pte_marker(ptent) &&
1089 		is_guard_swp_entry(pte_to_swp_entry(ptent));
1090 }
1091 
1092 static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
1093 				   unsigned long next, struct mm_walk *walk)
1094 {
1095 	pud_t pudval = pudp_get(pud);
1096 
1097 	/* If huge return >0 so we abort the operation + zap. */
1098 	return pud_trans_huge(pudval) || pud_devmap(pudval);
1099 }
1100 
1101 static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
1102 				   unsigned long next, struct mm_walk *walk)
1103 {
1104 	pmd_t pmdval = pmdp_get(pmd);
1105 
1106 	/* If huge return >0 so we abort the operation + zap. */
1107 	return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
1108 }
1109 
1110 static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
1111 				   unsigned long next, struct mm_walk *walk)
1112 {
1113 	pte_t pteval = ptep_get(pte);
1114 	unsigned long *nr_pages = (unsigned long *)walk->private;
1115 
1116 	/* If there is already a guard page marker, we have nothing to do. */
1117 	if (is_guard_pte_marker(pteval)) {
1118 		(*nr_pages)++;
1119 
1120 		return 0;
1121 	}
1122 
1123 	/* If populated return >0 so we abort the operation + zap. */
1124 	return 1;
1125 }
1126 
1127 static int guard_install_set_pte(unsigned long addr, unsigned long next,
1128 				 pte_t *ptep, struct mm_walk *walk)
1129 {
1130 	unsigned long *nr_pages = (unsigned long *)walk->private;
1131 
1132 	/* Simply install a PTE marker, this causes segfault on access. */
1133 	*ptep = make_pte_marker(PTE_MARKER_GUARD);
1134 	(*nr_pages)++;
1135 
1136 	return 0;
1137 }
1138 
1139 static const struct mm_walk_ops guard_install_walk_ops = {
1140 	.pud_entry		= guard_install_pud_entry,
1141 	.pmd_entry		= guard_install_pmd_entry,
1142 	.pte_entry		= guard_install_pte_entry,
1143 	.install_pte		= guard_install_set_pte,
1144 	.walk_lock		= PGWALK_RDLOCK,
1145 };
1146 
1147 static long madvise_guard_install(struct vm_area_struct *vma,
1148 				 struct vm_area_struct **prev,
1149 				 unsigned long start, unsigned long end)
1150 {
1151 	long err;
1152 	int i;
1153 
1154 	*prev = vma;
1155 	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
1156 		return -EINVAL;
1157 
1158 	/*
1159 	 * If we install guard markers, then the range is no longer
1160 	 * empty from a page table perspective and therefore it's
1161 	 * appropriate to have an anon_vma.
1162 	 *
1163 	 * This ensures that on fork, we copy page tables correctly.
1164 	 */
1165 	err = anon_vma_prepare(vma);
1166 	if (err)
1167 		return err;
1168 
1169 	/*
1170 	 * Optimistically try to install the guard marker pages first. If any
1171 	 * non-guard pages are encountered, give up and zap the range before
1172 	 * trying again.
1173 	 *
1174 	 * We try a few times before giving up and releasing back to userland to
1175 	 * loop around, releasing locks in the process to avoid contention. This
1176 	 * would only happen if there were a great many racing page faults.
1177 	 *
1178 	 * In most cases we should simply install the guard markers immediately
1179 	 * with no zap or looping.
1180 	 */
1181 	for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
1182 		unsigned long nr_pages = 0;
1183 
1184 		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
1185 		err = walk_page_range_mm(vma->vm_mm, start, end,
1186 					 &guard_install_walk_ops, &nr_pages);
1187 		if (err < 0)
1188 			return err;
1189 
1190 		if (err == 0) {
1191 			unsigned long nr_expected_pages = PHYS_PFN(end - start);
1192 
1193 			VM_WARN_ON(nr_pages != nr_expected_pages);
1194 			return 0;
1195 		}
1196 
1197 		/*
1198 		 * OK, some of the range has non-guard pages mapped; zap
1199 		 * them. This leaves existing guard pages in place.
1200 		 */
1201 		zap_page_range_single(vma, start, end - start, NULL);
1202 	}
1203 
1204 	/*
1205 	 * We were unable to install the guard pages due to being raced by page
1206 	 * faults. This should not happen ordinarily. We return to userspace and
1207 	 * immediately retry, relieving lock contention.
1208 	 */
1209 	return restart_syscall();
1210 }
1211 
1212 static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
1213 				  unsigned long next, struct mm_walk *walk)
1214 {
1215 	pud_t pudval = pudp_get(pud);
1216 
1217 	/* If huge, cannot have guard pages present, so no-op - skip. */
1218 	if (pud_trans_huge(pudval) || pud_devmap(pudval))
1219 		walk->action = ACTION_CONTINUE;
1220 
1221 	return 0;
1222 }
1223 
1224 static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
1225 				  unsigned long next, struct mm_walk *walk)
1226 {
1227 	pmd_t pmdval = pmdp_get(pmd);
1228 
1229 	/* If huge, cannot have guard pages present, so no-op - skip. */
1230 	if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
1231 		walk->action = ACTION_CONTINUE;
1232 
1233 	return 0;
1234 }
1235 
1236 static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
1237 				  unsigned long next, struct mm_walk *walk)
1238 {
1239 	pte_t ptent = ptep_get(pte);
1240 
1241 	if (is_guard_pte_marker(ptent)) {
1242 		/* Simply clear the PTE marker. */
1243 		pte_clear_not_present_full(walk->mm, addr, pte, false);
1244 		update_mmu_cache(walk->vma, addr, pte);
1245 	}
1246 
1247 	return 0;
1248 }
1249 
1250 static const struct mm_walk_ops guard_remove_walk_ops = {
1251 	.pud_entry		= guard_remove_pud_entry,
1252 	.pmd_entry		= guard_remove_pmd_entry,
1253 	.pte_entry		= guard_remove_pte_entry,
1254 	.walk_lock		= PGWALK_RDLOCK,
1255 };
1256 
1257 static long madvise_guard_remove(struct vm_area_struct *vma,
1258 				 struct vm_area_struct **prev,
1259 				 unsigned long start, unsigned long end)
1260 {
1261 	*prev = vma;
1262 	/*
1263 	 * We're ok with removing guards in mlock()'d ranges, as this is a
1264 	 * non-destructive action.
1265 	 */
1266 	if (!is_valid_guard_vma(vma, /* allow_locked = */true))
1267 		return -EINVAL;
1268 
1269 	return walk_page_range(vma->vm_mm, start, end,
1270 			       &guard_remove_walk_ops, NULL);
1271 }
1272 
1273 /*
1274  * Apply an madvise behavior to a region of a vma.  madvise_update_vma
1275  * will handle splitting a vm area into separate areas, each area with its own
1276  * behavior.
1277  */
1278 static int madvise_vma_behavior(struct vm_area_struct *vma,
1279 				struct vm_area_struct **prev,
1280 				unsigned long start, unsigned long end,
1281 				void *behavior_arg)
1282 {
1283 	struct madvise_behavior *arg = behavior_arg;
1284 	int behavior = arg->behavior;
1285 	int error;
1286 	struct anon_vma_name *anon_name;
1287 	unsigned long new_flags = vma->vm_flags;
1288 
1289 	if (unlikely(!can_modify_vma_madv(vma, behavior)))
1290 		return -EPERM;
1291 
1292 	switch (behavior) {
1293 	case MADV_REMOVE:
1294 		return madvise_remove(vma, prev, start, end);
1295 	case MADV_WILLNEED:
1296 		return madvise_willneed(vma, prev, start, end);
1297 	case MADV_COLD:
1298 		return madvise_cold(vma, prev, start, end);
1299 	case MADV_PAGEOUT:
1300 		return madvise_pageout(vma, prev, start, end);
1301 	case MADV_FREE:
1302 	case MADV_DONTNEED:
1303 	case MADV_DONTNEED_LOCKED:
1304 		return madvise_dontneed_free(vma, prev, start, end, arg);
1305 	case MADV_NORMAL:
1306 		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
1307 		break;
1308 	case MADV_SEQUENTIAL:
1309 		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
1310 		break;
1311 	case MADV_RANDOM:
1312 		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
1313 		break;
1314 	case MADV_DONTFORK:
1315 		new_flags |= VM_DONTCOPY;
1316 		break;
1317 	case MADV_DOFORK:
1318 		if (vma->vm_flags & VM_IO)
1319 			return -EINVAL;
1320 		new_flags &= ~VM_DONTCOPY;
1321 		break;
1322 	case MADV_WIPEONFORK:
1323 		/* MADV_WIPEONFORK is only supported on anonymous memory. */
1324 		if (vma->vm_file || vma->vm_flags & VM_SHARED)
1325 			return -EINVAL;
1326 		new_flags |= VM_WIPEONFORK;
1327 		break;
1328 	case MADV_KEEPONFORK:
1329 		if (vma->vm_flags & VM_DROPPABLE)
1330 			return -EINVAL;
1331 		new_flags &= ~VM_WIPEONFORK;
1332 		break;
1333 	case MADV_DONTDUMP:
1334 		new_flags |= VM_DONTDUMP;
1335 		break;
1336 	case MADV_DODUMP:
1337 		if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
1338 		    (vma->vm_flags & VM_DROPPABLE))
1339 			return -EINVAL;
1340 		new_flags &= ~VM_DONTDUMP;
1341 		break;
1342 	case MADV_MERGEABLE:
1343 	case MADV_UNMERGEABLE:
1344 		error = ksm_madvise(vma, start, end, behavior, &new_flags);
1345 		if (error)
1346 			goto out;
1347 		break;
1348 	case MADV_HUGEPAGE:
1349 	case MADV_NOHUGEPAGE:
1350 		error = hugepage_madvise(vma, &new_flags, behavior);
1351 		if (error)
1352 			goto out;
1353 		break;
1354 	case MADV_COLLAPSE:
1355 		return madvise_collapse(vma, prev, start, end);
1356 	case MADV_GUARD_INSTALL:
1357 		return madvise_guard_install(vma, prev, start, end);
1358 	case MADV_GUARD_REMOVE:
1359 		return madvise_guard_remove(vma, prev, start, end);
1360 	}
1361 
1362 	anon_name = anon_vma_name(vma);
1363 	anon_vma_name_get(anon_name);
1364 	error = madvise_update_vma(vma, prev, start, end, new_flags,
1365 				   anon_name);
1366 	anon_vma_name_put(anon_name);
1367 
1368 out:
1369 	/*
1370 	 * madvise() returns EAGAIN if kernel resources, such as
1371 	 * slab, are temporarily unavailable.
1372 	 */
1373 	if (error == -ENOMEM)
1374 		error = -EAGAIN;
1375 	return error;
1376 }
1377 
1378 #ifdef CONFIG_MEMORY_FAILURE
1379 /*
1380  * Error injection support for memory error handling.
1381  */
1382 static int madvise_inject_error(int behavior,
1383 		unsigned long start, unsigned long end)
1384 {
1385 	unsigned long size;
1386 
1387 	if (!capable(CAP_SYS_ADMIN))
1388 		return -EPERM;
1389 
1390 
1391 	for (; start < end; start += size) {
1392 		unsigned long pfn;
1393 		struct page *page;
1394 		int ret;
1395 
1396 		ret = get_user_pages_fast(start, 1, 0, &page);
1397 		if (ret != 1)
1398 			return ret;
1399 		pfn = page_to_pfn(page);
1400 
1401 		/*
1402 		 * When soft offlining hugepages, after migrating the page
1403 		 * we dissolve it, therefore in the second loop "page" will
1404 		 * no longer be a compound page.
1405 		 */
1406 		size = page_size(compound_head(page));
1407 
1408 		if (behavior == MADV_SOFT_OFFLINE) {
1409 			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
1410 				 pfn, start);
1411 			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
1412 		} else {
1413 			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
1414 				 pfn, start);
1415 			ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
1416 			if (ret == -EOPNOTSUPP)
1417 				ret = 0;
1418 		}
1419 
1420 		if (ret)
1421 			return ret;
1422 	}
1423 
1424 	return 0;
1425 }
1426 #endif
1427 
1428 static bool
1429 madvise_behavior_valid(int behavior)
1430 {
1431 	switch (behavior) {
1432 	case MADV_DOFORK:
1433 	case MADV_DONTFORK:
1434 	case MADV_NORMAL:
1435 	case MADV_SEQUENTIAL:
1436 	case MADV_RANDOM:
1437 	case MADV_REMOVE:
1438 	case MADV_WILLNEED:
1439 	case MADV_DONTNEED:
1440 	case MADV_DONTNEED_LOCKED:
1441 	case MADV_FREE:
1442 	case MADV_COLD:
1443 	case MADV_PAGEOUT:
1444 	case MADV_POPULATE_READ:
1445 	case MADV_POPULATE_WRITE:
1446 #ifdef CONFIG_KSM
1447 	case MADV_MERGEABLE:
1448 	case MADV_UNMERGEABLE:
1449 #endif
1450 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1451 	case MADV_HUGEPAGE:
1452 	case MADV_NOHUGEPAGE:
1453 	case MADV_COLLAPSE:
1454 #endif
1455 	case MADV_DONTDUMP:
1456 	case MADV_DODUMP:
1457 	case MADV_WIPEONFORK:
1458 	case MADV_KEEPONFORK:
1459 	case MADV_GUARD_INSTALL:
1460 	case MADV_GUARD_REMOVE:
1461 #ifdef CONFIG_MEMORY_FAILURE
1462 	case MADV_SOFT_OFFLINE:
1463 	case MADV_HWPOISON:
1464 #endif
1465 		return true;
1466 
1467 	default:
1468 		return false;
1469 	}
1470 }
1471 
1472 static bool process_madvise_behavior_valid(int behavior)
1473 {
1474 	switch (behavior) {
1475 	case MADV_COLD:
1476 	case MADV_PAGEOUT:
1477 	case MADV_WILLNEED:
1478 	case MADV_COLLAPSE:
1479 		return true;
1480 	default:
1481 		return false;
1482 	}
1483 }
1484 
1485 /*
1486  * Try to acquire a VMA read lock if possible.
1487  *
1488  * We only support this lock over a single VMA, which the input range must
1489  * span either partially or fully.
1490  *
1491  * This function always returns with an appropriate lock held. If a VMA read
1492  * lock could be acquired, we return the locked VMA.
1493  *
1494  * If a VMA read lock could not be acquired, we return NULL and expect caller to
1495  * fallback to mmap lock behaviour.
1496  */
1497 static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
1498 		struct madvise_behavior *madv_behavior,
1499 		unsigned long start, unsigned long end)
1500 {
1501 	struct vm_area_struct *vma;
1502 
1503 	vma = lock_vma_under_rcu(mm, start);
1504 	if (!vma)
1505 		goto take_mmap_read_lock;
1506 	/*
1507 	 * Must span only a single VMA; uffd and remote processes are
1508 	 * unsupported.
1509 	 */
1510 	if (end > vma->vm_end || current->mm != mm ||
1511 	    userfaultfd_armed(vma)) {
1512 		vma_end_read(vma);
1513 		goto take_mmap_read_lock;
1514 	}
1515 	return vma;
1516 
1517 take_mmap_read_lock:
1518 	mmap_read_lock(mm);
1519 	madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
1520 	return NULL;
1521 }
1522 
1523 /*
1524  * Walk the vmas in range [start,end), and call the visit function on each one.
1525  * The visit function will get start and end parameters that cover the overlap
1526  * between the current vma and the original range.  Any unmapped regions in the
1527  * original range will result in this function returning -ENOMEM while still
1528  * calling the visit function on all of the existing vmas in the range.
1529  * Must be called with the mmap_lock held for reading or writing.
1530  */
1531 static
1532 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
1533 		      unsigned long end, struct madvise_behavior *madv_behavior,
1534 		      void *arg,
1535 		      int (*visit)(struct vm_area_struct *vma,
1536 				   struct vm_area_struct **prev, unsigned long start,
1537 				   unsigned long end, void *arg))
1538 {
1539 	struct vm_area_struct *vma;
1540 	struct vm_area_struct *prev;
1541 	unsigned long tmp;
1542 	int unmapped_error = 0;
1543 	int error;
1544 
1545 	/*
1546 	 * If VMA read lock is supported, apply madvise to a single VMA
1547 	 * tentatively, avoiding walking VMAs.
1548 	 */
1549 	if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
1550 		vma = try_vma_read_lock(mm, madv_behavior, start, end);
1551 		if (vma) {
1552 			prev = vma;
1553 			error = visit(vma, &prev, start, end, arg);
1554 			vma_end_read(vma);
1555 			return error;
1556 		}
1557 	}
1558 
1559 	/*
1560 	 * If the interval [start,end) covers some unmapped address
1561 	 * ranges, just ignore them, but return -ENOMEM at the end.
1562 	 * - different from the way of handling in mlock etc.
1563 	 */
1564 	vma = find_vma_prev(mm, start, &prev);
1565 	if (vma && start > vma->vm_start)
1566 		prev = vma;
1567 
1568 	for (;;) {
1569 		/* Still start < end. */
1570 		if (!vma)
1571 			return -ENOMEM;
1572 
1573 		/* Here start < (end|vma->vm_end). */
1574 		if (start < vma->vm_start) {
1575 			unmapped_error = -ENOMEM;
1576 			start = vma->vm_start;
1577 			if (start >= end)
1578 				break;
1579 		}
1580 
1581 		/* Here vma->vm_start <= start < (end|vma->vm_end) */
1582 		tmp = vma->vm_end;
1583 		if (end < tmp)
1584 			tmp = end;
1585 
1586 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1587 		error = visit(vma, &prev, start, tmp, arg);
1588 		if (error)
1589 			return error;
1590 		start = tmp;
1591 		if (prev && start < prev->vm_end)
1592 			start = prev->vm_end;
1593 		if (start >= end)
1594 			break;
1595 		if (prev)
1596 			vma = find_vma(mm, prev->vm_end);
1597 		else	/* madvise_remove dropped mmap_lock */
1598 			vma = find_vma(mm, start);
1599 	}
1600 
1601 	return unmapped_error;
1602 }
1603 
1604 #ifdef CONFIG_ANON_VMA_NAME
1605 static int madvise_vma_anon_name(struct vm_area_struct *vma,
1606 				 struct vm_area_struct **prev,
1607 				 unsigned long start, unsigned long end,
1608 				 void *anon_name)
1609 {
1610 	int error;
1611 
1612 	/* Only anonymous mappings can be named */
1613 	if (vma->vm_file && !vma_is_anon_shmem(vma))
1614 		return -EBADF;
1615 
1616 	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
1617 				   anon_name);
1618 
1619 	/*
1620 	 * madvise() returns EAGAIN if kernel resources, such as
1621 	 * slab, are temporarily unavailable.
1622 	 */
1623 	if (error == -ENOMEM)
1624 		error = -EAGAIN;
1625 	return error;
1626 }
1627 
1628 int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
1629 			  unsigned long len_in, struct anon_vma_name *anon_name)
1630 {
1631 	unsigned long end;
1632 	unsigned long len;
1633 
1634 	if (start & ~__PAGE_MASK)
1635 		return -EINVAL;
1636 	len = (len_in + ~__PAGE_MASK) & __PAGE_MASK;
1637 
1638 	/* Check to see whether len was rounded up from small -ve to zero */
1639 	if (len_in && !len)
1640 		return -EINVAL;
1641 
1642 	end = start + len;
1643 	if (end < start)
1644 		return -EINVAL;
1645 
1646 	if (end == start)
1647 		return 0;
1648 
1649 	return madvise_walk_vmas(mm, start, end, NULL, anon_name,
1650 				 madvise_vma_anon_name);
1651 }
1652 #endif /* CONFIG_ANON_VMA_NAME */
1653 
1654 #ifdef CONFIG_MEMORY_FAILURE
1655 static bool is_memory_failure(int behavior)
1656 {
1657 	switch (behavior) {
1658 	case MADV_HWPOISON:
1659 	case MADV_SOFT_OFFLINE:
1660 		return true;
1661 	default:
1662 		return false;
1663 	}
1664 }
1665 #else
1666 static bool is_memory_failure(int behavior)
1667 {
1668 	return false;
1669 }
1670 #endif
1671 
1672 /*
1673  * Any behaviour which results in changes to the vma->vm_flags needs to
1674  * take mmap_lock for writing. Others, which simply traverse vmas, need
1675  * to only take it for reading.
1676  */
1677 static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
1678 {
1679 	int behavior = madv_behavior->behavior;
1680 
1681 	if (is_memory_failure(behavior))
1682 		return MADVISE_NO_LOCK;
1683 
1684 	switch (behavior) {
1685 	case MADV_REMOVE:
1686 	case MADV_WILLNEED:
1687 	case MADV_COLD:
1688 	case MADV_PAGEOUT:
1689 	case MADV_FREE:
1690 	case MADV_POPULATE_READ:
1691 	case MADV_POPULATE_WRITE:
1692 	case MADV_COLLAPSE:
1693 	case MADV_GUARD_INSTALL:
1694 	case MADV_GUARD_REMOVE:
1695 		return MADVISE_MMAP_READ_LOCK;
1696 	case MADV_DONTNEED:
1697 	case MADV_DONTNEED_LOCKED:
1698 		return MADVISE_VMA_READ_LOCK;
1699 	default:
1700 		return MADVISE_MMAP_WRITE_LOCK;
1701 	}
1702 }
1703 
1704 static int madvise_lock(struct mm_struct *mm,
1705 		struct madvise_behavior *madv_behavior)
1706 {
1707 	enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
1708 
1709 	switch (lock_mode) {
1710 	case MADVISE_NO_LOCK:
1711 		break;
1712 	case MADVISE_MMAP_WRITE_LOCK:
1713 		if (mmap_write_lock_killable(mm))
1714 			return -EINTR;
1715 		break;
1716 	case MADVISE_MMAP_READ_LOCK:
1717 		mmap_read_lock(mm);
1718 		break;
1719 	case MADVISE_VMA_READ_LOCK:
1720 		/* We will acquire the lock per-VMA in madvise_walk_vmas(). */
1721 		break;
1722 	}
1723 
1724 	madv_behavior->lock_mode = lock_mode;
1725 	return 0;
1726 }
1727 
1728 static void madvise_unlock(struct mm_struct *mm,
1729 		struct madvise_behavior *madv_behavior)
1730 {
1731 	switch (madv_behavior->lock_mode) {
1732 	case  MADVISE_NO_LOCK:
1733 		return;
1734 	case MADVISE_MMAP_WRITE_LOCK:
1735 		mmap_write_unlock(mm);
1736 		break;
1737 	case MADVISE_MMAP_READ_LOCK:
1738 		mmap_read_unlock(mm);
1739 		break;
1740 	case MADVISE_VMA_READ_LOCK:
1741 		/* We will drop the lock per-VMA in madvise_walk_vmas(). */
1742 		break;
1743 	}
1744 
1745 	madv_behavior->lock_mode = MADVISE_NO_LOCK;
1746 }

/*
 * untagged_addr_remote() assumes mmap_lock is already held. On
 * architectures like x86 and RISC-V, tagging is tricky because each
 * mm may have a different tagging mask. However, we might only hold
 * the per-VMA lock (currently only local processes are supported),
 * so untagged_addr is used to avoid the mmap_lock assertion for
 * local processes.
 */
static inline unsigned long get_untagged_addr(struct mm_struct *mm,
		unsigned long start)
{
	return current->mm == mm ? untagged_addr(start) :
				   untagged_addr_remote(mm, start);
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the
 *		application will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from the child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK.
 *  MADV_HWPOISON - trigger the memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required.
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or the application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - the map exists, but the area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 *  -EPERM  - memory is sealed.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	size_t len;
	struct blk_plug plug;
	struct madvise_behavior madv_behavior = {.behavior = behavior};
	bool bypass = false;

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!__PAGE_ALIGNED(start))
		return -EINVAL;
	len = __PAGE_ALIGN(len_in);

	/* Check whether len was rounded up from a small negative value to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	trace_android_vh_mm_do_madvise_bypass(mm, start, len, behavior,
					      &error, &bypass);
	if (bypass)
		return error;

	error = madvise_lock(mm, &madv_behavior);
	if (error)
		return error;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) {
		int ret = madvise_inject_error(behavior, start, start + len_in);

		madvise_unlock(mm, &madv_behavior);

		return ret;
	}
#endif

	start = get_untagged_addr(mm, start);
	end = start + len;

	blk_start_plug(&plug);
	switch (behavior) {
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		error = madvise_populate(mm, start, end, behavior);
		break;
	default:
		error = madvise_walk_vmas(mm, start, end, &madv_behavior,
					  &madv_behavior, madvise_vma_behavior);
		break;
	}
	blk_finish_plug(&plug);

	madvise_unlock(mm, &madv_behavior);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}
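
/*
 * Illustrative userspace sketch (not part of this kernel file) of the advice
 * values documented above: hint that an anonymous buffer will be needed soon,
 * use it, then tell the kernel its contents can be discarded. The buffer size
 * and the particular advice values are arbitrary example choices.
 *
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 1 << 20;	// 1 MiB scratch buffer
 *		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (buf == MAP_FAILED)
 *			return 1;
 *		if (madvise(buf, len, MADV_WILLNEED))	// read-ahead hint
 *			perror("madvise(MADV_WILLNEED)");
 *		buf[0] = 1;	// ... use the buffer ...
 *		// Private anonymous pages read back as zeroes after this.
 *		if (madvise(buf, len, MADV_DONTNEED))
 *			perror("madvise(MADV_DONTNEED)");
 *		return 0;
 *	}
 */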

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;
	bool bypass = false;
	bool return_error = false;

	trace_android_rvh_process_madvise_bypass(pidfd, vec,
			vlen, behavior, flags, &ret, &bypass);
	if (bypass)
		return ret;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (mm != current->mm && !capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);
	trace_android_vh_process_madvise_begin(task, behavior);

	while (iov_iter_count(&iter)) {
		trace_android_vh_process_madvise_iter(task, behavior, &ret);
		if (ret < 0)
			break;
		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
					iter_iov_len(&iter), behavior);
		/*
		 * An madvise operation is attempting to restart the syscall,
		 * but we cannot proceed as it would not be correct to repeat
		 * the operation in aggregate, and would be surprising to the
		 * user.
		 *
		 * As we have already dropped locks, it is safe to just loop and
		 * try again. We check for fatal signals in case we need to exit
		 * early anyway.
		 */
		if (ret == -ERESTARTNOINTR) {
			if (fatal_signal_pending(current)) {
				ret = -EINTR;
				break;
			}
			continue;
		}
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iter_iov_len(&iter));
	}
	trace_android_vh_process_madvise_return_error(behavior, ret, &return_error);
	if (return_error)
		goto release_mm;

	ret = (total_len - iov_iter_count(&iter)) ? : ret;

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	trace_android_vh_process_madvise(behavior, &ret, NULL);
	return ret;
}
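
/*
 * Illustrative userspace sketch (not part of this kernel file): advising
 * another process's memory via process_madvise(). It assumes headers that
 * define SYS_pidfd_open and SYS_process_madvise; the advise_cold() helper is
 * a hypothetical wrapper, the target pid, address and length are placeholders
 * the caller must supply, and the caller needs CAP_SYS_NICE plus
 * PTRACE_MODE_READ access as checked above.
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	static int advise_cold(pid_t pid, void *addr, size_t len)
 *	{
 *		struct iovec iov = { .iov_base = addr, .iov_len = len };
 *		int pidfd = syscall(SYS_pidfd_open, pid, 0);
 *		long ret;
 *
 *		if (pidfd < 0)
 *			return -1;
 *		ret = syscall(SYS_process_madvise, pidfd, &iov, 1UL,
 *			      MADV_COLD, 0U);
 *		if (ret < 0)
 *			perror("process_madvise");
 *		close(pidfd);
 *		return ret < 0 ? -1 : 0;
 *	}
 */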